xen/blkfront: Handle non-indirect grant with 64KB pages

author Julien Grall <julien.grall@citrix.com>

Thu, 13 Aug 2015 12:13:35 +0000 (13:13 +0100)

committer Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>

Mon, 4 Jan 2016 17:21:25 +0000 (12:21 -0500)
author Julien Grall <julien.grall@citrix.com>
Thu, 13 Aug 2015 12:13:35 +0000 (13:13 +0100)
committer Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Mon, 4 Jan 2016 17:21:25 +0000 (12:21 -0500)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c

index 0b32c90ffc3ffcde734fecc6fe8cc6d9593b318b..f3d0d47586410fab0cf4ca9698054bc3104fd108 100644 (file)
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -60,6 +60,20 @@
  
  #include <asm/xen/hypervisor.h>
  
+/*
+ * The minimal size of segment supported by the block framework is PAGE_SIZE.
+ * When Linux is using a different page size than Xen, it may not be possible
+ * to put all the data in a single segment.
+ * This can happen when the backend doesn't support indirect descriptor and
+ * therefore the maximum amount of data that a request can carry is
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
+ *
+ * Note that we only support one extra request. So the Linux page size
+ * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
+ * 88KB.
+ */
+#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
+
  enum blkif_state {
         BLKIF_STATE_DISCONNECTED,
         BLKIF_STATE_CONNECTED,
@@ -72,6 +86,13 @@ struct grant {
         struct list_head node;
  };
  
+enum blk_req_status {
+       REQ_WAITING,
+       REQ_DONE,
+       REQ_ERROR,
+       REQ_EOPNOTSUPP,
+};
+
  struct blk_shadow {
         struct blkif_request req;
         struct request *request;
@@ -79,6 +100,14 @@ struct blk_shadow {
         struct grant **indirect_grants;
         struct scatterlist *sg;
         unsigned int num_sg;
+       enum blk_req_status status;
+
+       #define NO_ASSOCIATED_ID ~0UL
+       /*
+        * Id of the sibling if we ever need 2 requests when handling a
+        * block I/O request
+        */
+       unsigned long associated_id;
  };
  
  struct split_bio {
@@ -492,6 +521,8 @@ static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
  
         id = get_id_from_freelist(rinfo);
         rinfo->shadow[id].request = req;
+       rinfo->shadow[id].status = REQ_WAITING;
+       rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
  
         (*ring_req)->u.rw.id = id;
  
@@ -533,6 +564,9 @@ struct setup_rw_req {
         bool need_copy;
         unsigned int bvec_off;
         char *bvec_data;
+
+       bool require_extra_req;
+       struct blkif_request *extra_ring_req;
  };
  
  static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
@@ -546,8 +580,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
         unsigned int grant_idx = setup->grant_idx;
         struct blkif_request *ring_req = setup->ring_req;
         struct blkfront_ring_info *rinfo = setup->rinfo;
+       /*
+        * We always use the shadow of the first request to store the list
+        * of grant associated to the block I/O request. This made the
+        * completion more easy to handle even if the block I/O request is
+        * split.
+        */
         struct blk_shadow *shadow = &rinfo->shadow[setup->id];
  
+       if (unlikely(setup->require_extra_req &&
+                    grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+               /*
+                * We are using the second request, setup grant_idx
+                * to be the index of the segment array.
+                */
+               grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+               ring_req = setup->extra_ring_req;
+       }
+
         if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
             (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
                 if (setup->segments)
@@ -562,7 +612,11 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
  
         gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
         ref = gnt_list_entry->gref;
-       shadow->grants_used[grant_idx] = gnt_list_entry;
+       /*
+        * All the grants are stored in the shadow of the first
+        * request. Therefore we have to use the global index.
+        */
+       shadow->grants_used[setup->grant_idx] = gnt_list_entry;
  
         if (setup->need_copy) {
                 void *shared_data;
@@ -604,11 +658,31 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
         (setup->grant_idx)++;
  }
  
+static void blkif_setup_extra_req(struct blkif_request *first,
+                                 struct blkif_request *second)
+{
+       uint16_t nr_segments = first->u.rw.nr_segments;
+
+       /*
+        * The second request is only present when the first request uses
+        * all its segments. It's always the continuity of the first one.
+        */
+       first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       second->u.rw.sector_number = first->u.rw.sector_number +
+               (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+       second->u.rw.handle = first->u.rw.handle;
+       second->operation = first->operation;
+}
+
  static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
  {
         struct blkfront_info *info = rinfo->dev_info;
-       struct blkif_request *ring_req;
-       unsigned long id;
+       struct blkif_request *ring_req, *extra_ring_req = NULL;
+       unsigned long id, extra_id = NO_ASSOCIATED_ID;
+       bool require_extra_req = false;
         int i;
         struct setup_rw_req setup = {
                 .grant_idx = 0,
@@ -650,19 +724,19 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
         /* Fill out a communications ring structure. */
         id = blkif_ring_get_request(rinfo, req, &ring_req);
  
-       BUG_ON(info->max_indirect_segments == 0 &&
-              GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-       BUG_ON(info->max_indirect_segments &&
-              GREFS(req->nr_phys_segments) > info->max_indirect_segments);
-
         num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
         num_grant = 0;
         /* Calculate the number of grant used */
         for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
                num_grant += gnttab_count_grant(sg->offset, sg->length);
  
+       require_extra_req = info->max_indirect_segments == 0 &&
+               num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
         rinfo->shadow[id].num_sg = num_sg;
-       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+       if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+           likely(!require_extra_req)) {
                 /*
                  * The indirect operation can only be a BLKIF_OP_READ or
                  * BLKIF_OP_WRITE
@@ -702,10 +776,30 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
                         }
                 }
                 ring_req->u.rw.nr_segments = num_grant;
+               if (unlikely(require_extra_req)) {
+                       extra_id = blkif_ring_get_request(rinfo, req,
+                                                         &extra_ring_req);
+                       /*
+                        * Only the first request contains the scatter-gather
+                        * list.
+                        */
+                       rinfo->shadow[extra_id].num_sg = 0;
+
+                       blkif_setup_extra_req(ring_req, extra_ring_req);
+
+                       /* Link the 2 requests together */
+                       rinfo->shadow[extra_id].associated_id = id;
+                       rinfo->shadow[id].associated_id = extra_id;
+               }
         }
  
         setup.ring_req = ring_req;
         setup.id = id;
+
+       setup.require_extra_req = require_extra_req;
+       if (unlikely(require_extra_req))
+               setup.extra_ring_req = extra_ring_req;
+
         for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
                 BUG_ON(sg->offset + sg->length > PAGE_SIZE);
  
@@ -728,6 +822,8 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri
  
         /* Keep a private copy so we can reissue requests when recovering. */
         rinfo->shadow[id].req = *ring_req;
+       if (unlikely(require_extra_req))
+               rinfo->shadow[extra_id].req = *extra_ring_req;
  
         if (max_grefs > 0)
                 gnttab_free_grant_references(setup.gref_head);
@@ -829,7 +925,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
         memset(&info->tag_set, 0, sizeof(info->tag_set));
         info->tag_set.ops = &blkfront_mq_ops;
         info->tag_set.nr_hw_queues = info->nr_rings;
-       info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+       if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+               /*
+                * When indirect descriptior is not supported, the I/O request
+                * will be split between multiple request in the ring.
+                * To avoid problems when sending the request, divide by
+                * 2 the depth of the queue.
+                */
+               info->tag_set.queue_depth =  BLK_RING_SIZE(info) / 2;
+       } else
+               info->tag_set.queue_depth = BLK_RING_SIZE(info);
         info->tag_set.numa_node = NUMA_NO_NODE;
         info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
         info->tag_set.cmd_size = 0;
@@ -1269,20 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
         kunmap_atomic(shared_data);
  }
  
-static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *rinfo,
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+       switch (rsp)
+       {
+       case BLKIF_RSP_OKAY:
+               return REQ_DONE;
+       case BLKIF_RSP_EOPNOTSUPP:
+               return REQ_EOPNOTSUPP;
+       case BLKIF_RSP_ERROR:
+               /* Fallthrough. */
+       default:
+               return REQ_ERROR;
+       }
+}
+
+/*
+ * Get the final status of the block request based on two ring response
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+                                 enum blk_req_status s2)
+{
+       BUG_ON(s1 == REQ_WAITING);
+       BUG_ON(s2 == REQ_WAITING);
+
+       if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+               return BLKIF_RSP_ERROR;
+       else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+               return BLKIF_RSP_EOPNOTSUPP;
+       return BLKIF_RSP_OKAY;
+}
+
+static bool blkif_completion(unsigned long *id,
+                            struct blkfront_ring_info *rinfo,
                              struct blkif_response *bret)
  {
         int i = 0;
         struct scatterlist *sg;
         int num_sg, num_grant;
         struct blkfront_info *info = rinfo->dev_info;
+       struct blk_shadow *s = &rinfo->shadow[*id];
         struct copy_from_grant data = {
-               .s = s,
                 .grant_idx = 0,
         };
  
         num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
                 s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+       /* The I/O request may be split in two. */
+       if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+               struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+               /* Keep the status of the current response in shadow. */
+               s->status = blkif_rsp_to_req_status(bret->status);
+
+               /* Wait the second response if not yet here. */
+               if (s2->status == REQ_WAITING)
+                       return 0;
+
+               bret->status = blkif_get_final_status(s->status,
+                                                     s2->status);
+
+               /*
+                * All the grants is stored in the first shadow in order
+                * to make the completion code simpler.
+                */
+               num_grant += s2->req.u.rw.nr_segments;
+
+               /*
+                * The two responses may not come in order. Only the
+                * first request will store the scatter-gather list.
+                */
+               if (s2->num_sg != 0) {
+                       /* Update "id" with the ID of the first response. */
+                       *id = s->associated_id;
+                       s = s2;
+               }
+
+               /*
+                * We don't need anymore the second request, so recycling
+                * it now.
+                */
+               if (add_id_to_freelist(rinfo, s->associated_id))
+                       WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+                            info->gd->disk_name, s->associated_id);
+       }
+
+       data.s = s;
         num_sg = s->num_sg;
  
         if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
@@ -1352,6 +1530,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_ring_info *ri
                         }
                 }
         }
+
+       return 1;
  }
  
  static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1391,8 +1571,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
                 }
                 req  = rinfo->shadow[id].request;
  
-               if (bret->operation != BLKIF_OP_DISCARD)
-                       blkif_completion(&rinfo->shadow[id], rinfo, bret);
+               if (bret->operation != BLKIF_OP_DISCARD) {
+                       /*
+                        * We may need to wait for an extra response if the
+                        * I/O request is split in 2
+                        */
+                       if (!blkif_completion(&id, rinfo, bret))
+                               continue;
+               }
  
                 if (add_id_to_freelist(rinfo, id)) {
                         WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
@@ -2017,8 +2203,18 @@ static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
         int err, i;
         struct blkfront_info *info = rinfo->dev_info;
  
-       if (info->max_indirect_segments == 0)
-               grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+       if (info->max_indirect_segments == 0) {
+               if (!HAS_EXTRA_REQ)
+                       grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+               else {
+                       /*
+                        * When an extra req is required, the maximum
+                        * grants supported is related to the size of the
+                        * Linux block segment.
+                        */
+                       grants = GRANTS_PER_PSEG;
+               }
+       }
         else
                 grants = info->max_indirect_segments;
         psegs = grants / GRANTS_PER_PSEG;
author	Julien Grall <julien.grall@citrix.com>
	Thu, 13 Aug 2015 12:13:35 +0000 (13:13 +0100)
committer	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
	Mon, 4 Jan 2016 17:21:25 +0000 (12:21 -0500)