drbd: Load balancing of read requests
author Philipp Reisner <philipp.reisner@linbit.com>
Fri, 11 Nov 2011 11:31:20 +0000 (12:31 +0100)
committer Philipp Reisner <philipp.reisner@linbit.com>
Thu, 8 Nov 2012 15:58:10 +0000 (16:58 +0100)
New config option for the disk section "read-balancing", with
the values: prefer-local, prefer-remote, round-robin, when-congested-remote.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
include/linux/drbd.h
include/linux/drbd_genl.h
include/linux/drbd_limits.h

index d397681fb7aa1f91d105d8ffbf02525d08523f8e..e2cccb40f5af32f8a73ed83b9582f3961382c709 100644 (file)
@@ -698,6 +698,7 @@ enum {
        AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
        B_RS_H_DONE,            /* Before resync handler done (already executed) */
        DISCARD_MY_DATA,        /* discard_my_data flag per volume */
+       READ_BALANCE_RR,
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
index e546dd3fab8a8100a5380b37bb4fc11a6e6d8583..733b8bd663d5ae3ef58daa2c52ce519c9905837c 100644 (file)
@@ -4974,7 +4974,7 @@ static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi)
 
        update_peer_seq(mdev, be32_to_cpu(p->seq_num));
 
-       dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+       dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n",
            (unsigned long long)sector, be32_to_cpu(p->blksize));
 
        return validate_req_change_req_state(mdev, p->block_id, sector,
index ceb04a94aacecb26bb5e653a1e91963f08559255..98251e2a7fb78a0a1ddcf519253380d6028fedfb 100644 (file)
@@ -563,6 +563,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
                        atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
 
+               if (!(req->rq_state & RQ_WRITE) &&
+                   mdev->state.disk == D_UP_TO_DATE &&
+                   !IS_ERR_OR_NULL(req->private_bio))
+                       goto goto_read_retry_local;
+
                /* if it is still queued, we may not complete it here.
                 * it will be canceled soon. */
                if (!(req->rq_state & RQ_NET_QUEUED))
@@ -625,10 +630,22 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
 
                req->rq_state |= RQ_NET_DONE;
+
+               if (!(req->rq_state & RQ_WRITE) &&
+                   mdev->state.disk == D_UP_TO_DATE &&
+                   !IS_ERR_OR_NULL(req->private_bio))
+                       goto goto_read_retry_local;
+
                _req_may_be_done_not_susp(req, m);
                /* else: done by HANDED_OVER_TO_NETWORK */
                break;
 
+       goto_read_retry_local:
+               req->rq_state |= RQ_LOCAL_PENDING;
+               req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+               generic_make_request(req->private_bio);
+               break;
+
        case FAIL_FROZEN_DISK_IO:
                if (!(req->rq_state & RQ_LOCAL_COMPLETED))
                        break;
@@ -689,6 +706,11 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
                dec_ap_pending(mdev);
                req->rq_state &= ~RQ_NET_PENDING;
                req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
+               if (!IS_ERR_OR_NULL(req->private_bio)) {
+                       bio_put(req->private_bio);
+                       req->private_bio = NULL;
+                       put_ldev(mdev);
+               }
                _req_may_be_done_not_susp(req, m);
                break;
        };
@@ -723,6 +745,35 @@ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int
        return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
 }
 
+static bool remote_due_to_read_balancing(struct drbd_conf *mdev) /* true: the configured read_balancing policy asks for this read to go to the peer */
+{
+       enum drbd_read_balancing rbm;
+       struct backing_dev_info *bdi;
+
+       if (mdev->state.pdsk < D_UP_TO_DATE) /* peer disk state below D_UP_TO_DATE: never redirect to it */
+               return false;
+
+       rcu_read_lock(); /* disk_conf is read via rcu_dereference(); copy the policy out inside the read-side section */
+       rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
+       rcu_read_unlock();
+
+       switch (rbm) {
+       case RB_CONGESTED_REMOTE: /* go remote only while the local backing device's queue reports read congestion */
+               bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
+               return bdi_read_congested(bdi);
+       case RB_LEAST_PENDING: /* go remote when local_cnt exceeds ap_pending_cnt + rs_pending_cnt; presumably "local side is busier" - confirm counter semantics */
+               return atomic_read(&mdev->local_cnt) >
+                       atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt);
+       case RB_ROUND_ROBIN: /* flips READ_BALANCE_RR and returns its previous value, so consecutive calls alternate */
+               return test_and_change_bit(READ_BALANCE_RR, &mdev->flags);
+       case RB_PREFER_REMOTE: /* always redirect (the peer was already checked to be up to date above) */
+               return true;
+       case RB_PREFER_LOCAL: /* shares the default branch: never redirect */
+       default:
+               return false;
+       }
+}
+
 /*
  * complete_conflicting_writes  -  wait for any conflicting write requests
  *
@@ -790,6 +841,10 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
                                bio_put(req->private_bio);
                                req->private_bio = NULL;
                                put_ldev(mdev);
+                       } else if (remote_due_to_read_balancing(mdev)) {
+                               /* Keep the private bio in case we need it
+                                  for a local retry */
+                               local = 0;
                        }
                }
                remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
@@ -1017,7 +1072,7 @@ fail_free_complete:
        if (req->rq_state & RQ_IN_ACT_LOG)
                drbd_al_complete_io(mdev, &req->i);
 fail_and_free_req:
-       if (local) {
+       if (!IS_ERR_OR_NULL(req->private_bio)) {
                bio_put(req->private_bio);
                req->private_bio = NULL;
                put_ldev(mdev);
index 1e9f754b66acf0beba8a4466cade5751aed12f6f..157ba3d74dc76542a96a1ed3a14618b8ecc2e68f 100644 (file)
@@ -102,6 +102,14 @@ enum drbd_on_congestion {
        OC_DISCONNECT,
 };
 
+enum drbd_read_balancing { /* policy consulted by remote_due_to_read_balancing() to pick local vs. peer for reads */
+       RB_PREFER_LOCAL, /* never redirect reads to the peer; this is DRBD_READ_BALANCING_DEF */
+       RB_PREFER_REMOTE, /* always redirect reads to the peer when its disk is up to date */
+       RB_ROUND_ROBIN, /* alternate between local and peer by toggling the READ_BALANCE_RR flag */
+       RB_LEAST_PENDING, /* redirect when local_cnt exceeds ap_pending_cnt + rs_pending_cnt */
+       RB_CONGESTED_REMOTE, /* redirect while the local backing device reports read congestion */
+};
+
 /* KEEP the order, do not delete or insert. Only append. */
 enum drbd_ret_code {
        ERR_CODE_BASE           = 100,
index 2e6cefefe5e53848e4f11aa3187c00503c29d590..826008f297fe6e670133b0ec906a22d72b3ad15e 100644 (file)
@@ -129,6 +129,7 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
        __flg_field_def(18, DRBD_GENLA_F_MANDATORY,     disk_drain, DRBD_DISK_DRAIN_DEF)
        __flg_field_def(19, DRBD_GENLA_F_MANDATORY,     md_flushes, DRBD_MD_FLUSHES_DEF)
        __u32_field_def(20,     DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
+       __u32_field_def(21,     0 /* OPTIONAL */,       read_balancing, DRBD_READ_BALANCING_DEF)
 )
 
 GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts,
index 6d0a24331ed2cd972c5125865069f221819de856..17ef66a5c114e737f1f86e1aa6225085a48edd1e 100644 (file)
 #define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
 #define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
 #define DRBD_ON_CONGESTION_DEF OC_BLOCK
+#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL
 
 #define DRBD_MAX_BIO_BVECS_MIN 0
 #define DRBD_MAX_BIO_BVECS_MAX 128