dm mpath: eliminate use of spinlock in IO fast-paths
author		Mike Snitzer <snitzer@redhat.com>
		Thu, 17 Mar 2016 22:38:17 +0000 (18:38 -0400)
committer	Mike Snitzer <snitzer@redhat.com>
		Thu, 5 May 2016 19:25:52 +0000 (15:25 -0400)

The primary motivation of this commit is to improve the scalability of
DM multipath on large NUMA systems where m->lock spinlock contention has
been proven to be a serious bottleneck on really fast storage.

This commit leverages the ability to atomically read a pointer using
lockless_dereference().  All pointer writes are still protected by the
m->lock spinlock (which is fine since they all now occur in the
slow-path).
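
For illustration only (not part of the patch), the read-mostly pattern
this relies on looks like the following sketch; example_state,
cur_pgpath and the two helpers are hypothetical names:

	struct example_state {
		spinlock_t lock;		/* serializes all writers */
		struct pgpath *cur_pgpath;	/* read locklessly in the fast-path */
	};

	/* fast-path reader: no lock taken */
	static struct pgpath *example_read(struct example_state *s)
	{
		return lockless_dereference(s->cur_pgpath);
	}

	/* slow-path writer: pointer updates stay under the spinlock */
	static void example_write(struct example_state *s, struct pgpath *p)
	{
		unsigned long flags;

		spin_lock_irqsave(&s->lock, flags);
		s->cur_pgpath = p;
		spin_unlock_irqrestore(&s->lock, flags);
	}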

The following functions no longer require the m->lock spinlock in their
fast-paths: multipath_busy(), __multipath_map(), and do_end_io().

And choose_pgpath() is modified to _not_ update m->current_pgpath unless
it also switches the path-group.  This is done to avoid needing to take
m->lock every time __multipath_map() calls choose_pgpath().  But
m->current_pgpath will be reset if it is failed via fail_path().
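
As a condensed sketch (using names from the diff below): in
choose_path_in_pg() only a priority-group switch takes m->lock;
re-selecting a path within the current PG stays lock-free:

	pgpath = path_to_pgpath(pg->ps.type->select_path(&pg->ps, nr_bytes));

	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
		/* slow-path: the PG changed, so pointer writes go under m->lock */
		spin_lock_irqsave(&m->lock, flags);
		m->current_pgpath = pgpath;
		__switch_pg(m, pg);
		spin_unlock_irqrestore(&m->lock, flags);
	}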

Suggested-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
drivers/md/dm-mpath.c

index 54daf96980c27d07b432f3157e639941084a1078..52baf8a5b0f47d8a130c9baf48cc9eacdcb4b05c 100644 (file)
@@ -305,9 +305,21 @@ static int __pg_init_all_paths(struct multipath *m)
        return atomic_read(&m->pg_init_in_progress);
 }
 
-static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
+static int pg_init_all_paths(struct multipath *m)
 {
-       m->current_pg = pgpath->pg;
+       int r;
+       unsigned long flags;
+
+       spin_lock_irqsave(&m->lock, flags);
+       r = __pg_init_all_paths(m);
+       spin_unlock_irqrestore(&m->lock, flags);
+
+       return r;
+}
+
+static void __switch_pg(struct multipath *m, struct priority_group *pg)
+{
+       m->current_pg = pg;
 
        /* Must we initialise the PG first, and queue I/O till it's ready? */
        if (m->hw_handler_name) {
@@ -321,26 +333,36 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
        atomic_set(&m->pg_init_count, 0);
 }
 
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
-                              size_t nr_bytes)
+static struct pgpath *choose_path_in_pg(struct multipath *m,
+                                       struct priority_group *pg,
+                                       size_t nr_bytes)
 {
+       unsigned long flags;
        struct dm_path *path;
+       struct pgpath *pgpath;
 
        path = pg->ps.type->select_path(&pg->ps, nr_bytes);
        if (!path)
-               return -ENXIO;
+               return ERR_PTR(-ENXIO);
 
-       m->current_pgpath = path_to_pgpath(path);
+       pgpath = path_to_pgpath(path);
 
-       if (m->current_pg != pg)
-               __switch_pg(m, m->current_pgpath);
+       if (unlikely(lockless_dereference(m->current_pg) != pg)) {
+               /* Only update current_pgpath if pg changed */
+               spin_lock_irqsave(&m->lock, flags);
+               m->current_pgpath = pgpath;
+               __switch_pg(m, pg);
+               spin_unlock_irqrestore(&m->lock, flags);
+       }
 
-       return 0;
+       return pgpath;
 }
 
-static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
+static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
+       unsigned long flags;
        struct priority_group *pg;
+       struct pgpath *pgpath;
        bool bypassed = true;
 
        if (!atomic_read(&m->nr_valid_paths)) {
@@ -349,16 +371,28 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
        }
 
        /* Were we instructed to switch PG? */
-       if (m->next_pg) {
+       if (lockless_dereference(m->next_pg)) {
+               spin_lock_irqsave(&m->lock, flags);
                pg = m->next_pg;
+               if (!pg) {
+                       spin_unlock_irqrestore(&m->lock, flags);
+                       goto check_current_pg;
+               }
                m->next_pg = NULL;
-               if (!__choose_path_in_pg(m, pg, nr_bytes))
-                       return;
+               spin_unlock_irqrestore(&m->lock, flags);
+               pgpath = choose_path_in_pg(m, pg, nr_bytes);
+               if (!IS_ERR_OR_NULL(pgpath))
+                       return pgpath;
        }
 
        /* Don't change PG until it has no remaining paths */
-       if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
-               return;
+check_current_pg:
+       pg = lockless_dereference(m->current_pg);
+       if (pg) {
+               pgpath = choose_path_in_pg(m, pg, nr_bytes);
+               if (!IS_ERR_OR_NULL(pgpath))
+                       return pgpath;
+       }
 
        /*
         * Loop through priority groups until we find a valid path.
@@ -370,31 +404,34 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
                list_for_each_entry(pg, &m->priority_groups, list) {
                        if (pg->bypassed == bypassed)
                                continue;
-                       if (!__choose_path_in_pg(m, pg, nr_bytes)) {
+                       pgpath = choose_path_in_pg(m, pg, nr_bytes);
+                       if (!IS_ERR_OR_NULL(pgpath)) {
                                if (!bypassed)
                                        set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
-                               return;
+                               return pgpath;
                        }
                }
        } while (bypassed--);
 
 failed:
+       spin_lock_irqsave(&m->lock, flags);
        m->current_pgpath = NULL;
        m->current_pg = NULL;
+       spin_unlock_irqrestore(&m->lock, flags);
+
+       return NULL;
 }
 
 /*
  * Check whether bios must be queued in the device-mapper core rather
  * than here in the target.
  *
- * m->lock must be held on entry.
- *
  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
  * same value then we are not between multipath_presuspend()
  * and multipath_resume() calls and we have no need to check
  * for the DMF_NOFLUSH_SUSPENDING flag.
  */
-static int __must_push_back(struct multipath *m)
+static int must_push_back(struct multipath *m)
 {
        return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
                ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
@@ -416,36 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
        struct block_device *bdev;
        struct dm_mpath_io *mpio;
 
-       spin_lock_irq(&m->lock);
-
        /* Do we need to select a new pgpath? */
-       if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
-               __choose_pgpath(m, nr_bytes);
-
-       pgpath = m->current_pgpath;
+       pgpath = lockless_dereference(m->current_pgpath);
+       if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+               pgpath = choose_pgpath(m, nr_bytes);
 
        if (!pgpath) {
-               if (!__must_push_back(m))
+               if (!must_push_back(m))
                        r = -EIO;       /* Failed */
-               goto out_unlock;
+               return r;
        } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
                   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
-               __pg_init_all_paths(m);
-               goto out_unlock;
+               pg_init_all_paths(m);
+               return r;
        }
 
        mpio = set_mpio(m, map_context);
        if (!mpio)
                /* ENOMEM, requeue */
-               goto out_unlock;
+               return r;
 
        mpio->pgpath = pgpath;
        mpio->nr_bytes = nr_bytes;
 
        bdev = pgpath->path.dev->bdev;
 
-       spin_unlock_irq(&m->lock);
-
        if (clone) {
                /*
                 * Old request-based interface: allocated clone is passed in.
@@ -477,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
                                              &pgpath->path,
                                              nr_bytes);
        return DM_MAPIO_REMAPPED;
-
-out_unlock:
-       spin_unlock_irq(&m->lock);
-
-       return r;
 }
 
 static int multipath_map(struct dm_target *ti, struct request *clone,
@@ -1308,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
         * clone bios for it and resubmit it later.
         */
        int r = DM_ENDIO_REQUEUE;
-       unsigned long flags;
 
        if (!error && !clone->errors)
                return 0;       /* I/O complete */
@@ -1319,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone,
        if (mpio->pgpath)
                fail_path(mpio->pgpath);
 
-       spin_lock_irqsave(&m->lock, flags);
        if (!atomic_read(&m->nr_valid_paths)) {
                if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-                       if (!__must_push_back(m))
+                       if (!must_push_back(m))
                                r = -EIO;
                } else {
                        if (error == -EBADE)
                                r = error;
                }
        }
-       spin_unlock_irqrestore(&m->lock, flags);
 
        return r;
 }
@@ -1586,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
                struct block_device **bdev, fmode_t *mode)
 {
        struct multipath *m = ti->private;
-       unsigned long flags;
+       struct pgpath *current_pgpath;
        int r;
 
-       spin_lock_irqsave(&m->lock, flags);
-
-       if (!m->current_pgpath)
-               __choose_pgpath(m, 0);
+       current_pgpath = lockless_dereference(m->current_pgpath);
+       if (!current_pgpath)
+               current_pgpath = choose_pgpath(m, 0);
 
-       if (m->current_pgpath) {
+       if (current_pgpath) {
                if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
-                       *bdev = m->current_pgpath->path.dev->bdev;
-                       *mode = m->current_pgpath->path.dev->mode;
+                       *bdev = current_pgpath->path.dev->bdev;
+                       *mode = current_pgpath->path.dev->mode;
                        r = 0;
                } else {
                        /* pg_init has not started or completed */
@@ -1611,17 +1634,13 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
                        r = -EIO;
        }
 
-       spin_unlock_irqrestore(&m->lock, flags);
-
        if (r == -ENOTCONN) {
-               spin_lock_irqsave(&m->lock, flags);
-               if (!m->current_pg) {
+               if (!lockless_dereference(m->current_pg)) {
                        /* Path status changed, redo selection */
-                       __choose_pgpath(m, 0);
+                       (void) choose_pgpath(m, 0);
                }
                if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
-                       __pg_init_all_paths(m);
-               spin_unlock_irqrestore(&m->lock, flags);
+                       pg_init_all_paths(m);
                dm_table_run_md_queue_async(m->ti->table);
        }
 
@@ -1672,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti)
 {
        bool busy = false, has_active = false;
        struct multipath *m = ti->private;
-       struct priority_group *pg;
+       struct priority_group *pg, *next_pg;
        struct pgpath *pgpath;
-       unsigned long flags;
-
-       spin_lock_irqsave(&m->lock, flags);
 
        /* pg_init in progress or no paths available */
        if (atomic_read(&m->pg_init_in_progress) ||
-           (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
-               busy = true;
-               goto out;
-       }
+           (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
+               return true;
+
        /* Guess which priority_group will be used at next mapping time */
-       if (unlikely(!m->current_pgpath && m->next_pg))
-               pg = m->next_pg;
-       else if (likely(m->current_pg))
-               pg = m->current_pg;
-       else
+       pg = lockless_dereference(m->current_pg);
+       next_pg = lockless_dereference(m->next_pg);
+       if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
+               pg = next_pg;
+
+       if (!pg) {
                /*
                 * We don't know which pg will be used at next mapping time.
-                * We don't call __choose_pgpath() here to avoid to trigger
+                * We don't call choose_pgpath() here to avoid triggering
                 * pg_init just by busy checking.
                 * So we don't know whether underlying devices we will be using
                 * at next mapping time are busy or not. Just try mapping.
                 */
-               goto out;
+               return busy;
+       }
 
        /*
         * If there is one non-busy active path at least, the path selector
         * will be able to select it. So we consider such a pg as not busy.
         */
        busy = true;
-       list_for_each_entry(pgpath, &pg->pgpaths, list)
+       list_for_each_entry(pgpath, &pg->pgpaths, list) {
                if (pgpath->is_active) {
                        has_active = true;
                        if (!pgpath_busy(pgpath)) {
@@ -1712,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti)
                                break;
                        }
                }
+       }
 
-       if (!has_active)
+       if (!has_active) {
                /*
                 * No active path in this pg, so this pg won't be used and
                 * the current_pg will be changed at next mapping time.
                 * We need to try mapping to determine it.
                 */
                busy = false;
-
-out:
-       spin_unlock_irqrestore(&m->lock, flags);
+       }
 
        return busy;
 }