xfs: improve handling of busy extents in the low-level allocator
authorChristoph Hellwig <hch@lst.de>
Tue, 7 Feb 2017 22:06:57 +0000 (14:06 -0800)
committerDarrick J. Wong <darrick.wong@oracle.com>
Thu, 9 Feb 2017 18:50:25 +0000 (10:50 -0800)
Currently we force the log and simply try again if we hit a busy extent,
but especially with online discard enabled it might take a while after
the log force for the busy extents to disappear, and we might have
already completed our second pass.

So instead we add a new waitqueue and a generation counter to the pag
structure so that we can do wakeups once we've removed busy extents,
and we replace the single retry with an unconditional one - after
all we hold the AGF buffer lock, so no other allocations or frees
can be racing with us in this AG.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/xfs_extent_busy.c
fs/xfs/xfs_extent_busy.h
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h

index 9f06a211e1570549cb8df3cdba2de2d01a79f890..fe98fbc4adf17ebe5260935d7eeaed995f0f8148 100644 (file)
@@ -221,20 +221,22 @@ xfs_alloc_get_rec(
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC void
+STATIC bool
 xfs_alloc_compute_aligned(
        xfs_alloc_arg_t *args,          /* allocation argument structure */
        xfs_agblock_t   foundbno,       /* starting block in found extent */
        xfs_extlen_t    foundlen,       /* length in found extent */
        xfs_agblock_t   *resbno,        /* result block number */
-       xfs_extlen_t    *reslen)        /* result length */
+       xfs_extlen_t    *reslen,        /* result length */
+       unsigned        *busy_gen)
 {
-       xfs_agblock_t   bno;
-       xfs_extlen_t    len;
+       xfs_agblock_t   bno = foundbno;
+       xfs_extlen_t    len = foundlen;
        xfs_extlen_t    diff;
+       bool            busy;
 
        /* Trim busy sections out of found extent */
-       xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+       busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
 
        /*
         * If we have a largish extent that happens to start before min_agbno,
@@ -259,6 +261,8 @@ xfs_alloc_compute_aligned(
                *resbno = bno;
                *reslen = len;
        }
+
+       return busy;
 }
 
 /*
@@ -737,10 +741,11 @@ xfs_alloc_ag_vextent_exact(
        int             error;
        xfs_agblock_t   fbno;   /* start block of found extent */
        xfs_extlen_t    flen;   /* length of found extent */
-       xfs_agblock_t   tbno;   /* start block of trimmed extent */
-       xfs_extlen_t    tlen;   /* length of trimmed extent */
-       xfs_agblock_t   tend;   /* end block of trimmed extent */
+       xfs_agblock_t   tbno;   /* start block of busy extent */
+       xfs_extlen_t    tlen;   /* length of busy extent */
+       xfs_agblock_t   tend;   /* end block of busy extent */
        int             i;      /* success/failure of operation */
+       unsigned        busy_gen;
 
        ASSERT(args->alignment == 1);
 
@@ -773,7 +778,9 @@ xfs_alloc_ag_vextent_exact(
        /*
         * Check for overlapping busy extents.
         */
-       xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+       tbno = fbno;
+       tlen = flen;
+       xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
 
        /*
         * Give up if the start of the extent is busy, or the freespace isn't
@@ -853,6 +860,7 @@ xfs_alloc_find_best_extent(
        xfs_agblock_t           sdiff;
        int                     error;
        int                     i;
+       unsigned                busy_gen;
 
        /* The good extent is perfect, no need to  search. */
        if (!gdiff)
@@ -866,7 +874,8 @@ xfs_alloc_find_best_extent(
                if (error)
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-               xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+               xfs_alloc_compute_aligned(args, *sbno, *slen,
+                               sbnoa, slena, &busy_gen);
 
                /*
                 * The good extent is closer than this one.
@@ -955,7 +964,8 @@ xfs_alloc_ag_vextent_near(
        xfs_extlen_t    ltlena;         /* aligned ... */
        xfs_agblock_t   ltnew;          /* useful start bno of left side */
        xfs_extlen_t    rlen;           /* length of returned extent */
-       int             forced = 0;
+       bool            busy;
+       unsigned        busy_gen;
 #ifdef DEBUG
        /*
         * Randomly don't execute the first algorithm.
@@ -982,6 +992,7 @@ restart:
        ltlen = 0;
        gtlena = 0;
        ltlena = 0;
+       busy = false;
 
        /*
         * Get a cursor for the by-size btree.
@@ -1064,8 +1075,8 @@ restart:
                        if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-                       xfs_alloc_compute_aligned(args, ltbno, ltlen,
-                                                 &ltbnoa, &ltlena);
+                       busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                       &ltbnoa, &ltlena, &busy_gen);
                        if (ltlena < args->minlen)
                                continue;
                        if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
@@ -1183,8 +1194,8 @@ restart:
                        if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-                       xfs_alloc_compute_aligned(args, ltbno, ltlen,
-                                                 &ltbnoa, &ltlena);
+                       busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
+                                       &ltbnoa, &ltlena, &busy_gen);
                        if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
                                break;
                        if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1199,8 +1210,8 @@ restart:
                        if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-                       xfs_alloc_compute_aligned(args, gtbno, gtlen,
-                                                 &gtbnoa, &gtlena);
+                       busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
+                                       &gtbnoa, &gtlena, &busy_gen);
                        if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
                                break;
                        if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1261,9 +1272,9 @@ restart:
        if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 
-               if (!forced++) {
+               if (busy) {
                        trace_xfs_alloc_near_busy(args);
-                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                       xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
                        goto restart;
                }
                trace_xfs_alloc_size_neither(args);
@@ -1344,7 +1355,8 @@ xfs_alloc_ag_vextent_size(
        int             i;              /* temp status variable */
        xfs_agblock_t   rbno;           /* returned block number */
        xfs_extlen_t    rlen;           /* length of returned extent */
-       int             forced = 0;
+       bool            busy;
+       unsigned        busy_gen;
 
 restart:
        /*
@@ -1353,6 +1365,7 @@ restart:
        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                args->agno, XFS_BTNUM_CNT);
        bno_cur = NULL;
+       busy = false;
 
        /*
         * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1362,14 +1375,13 @@ restart:
                goto error0;
 
        /*
-        * If none or we have busy extents that we cannot allocate from, then
-        * we have to settle for a smaller extent. In the case that there are
-        * no large extents, this will return the last entry in the tree unless
-        * the tree is empty. In the case that there are only busy large
-        * extents, this will return the largest small extent unless there
+        * If none then we have to settle for a smaller extent. In the case that
+        * there are no large extents, this will return the last entry in the
+        * tree unless the tree is empty. In the case that there are only busy
+        * large extents, this will return the largest small extent unless there
         * are no smaller extents available.
         */
-       if (!i || forced > 1) {
+       if (!i) {
                error = xfs_alloc_ag_vextent_small(args, cnt_cur,
                                                   &fbno, &flen, &i);
                if (error)
@@ -1380,13 +1392,11 @@ restart:
                        return 0;
                }
                ASSERT(i == 1);
-               xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+               busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
+                               &rlen, &busy_gen);
        } else {
                /*
                 * Search for a non-busy extent that is large enough.
-                * If we are at low space, don't check, or if we fall of
-                * the end of the btree, turn off the busy check and
-                * restart.
                 */
                for (;;) {
                        error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
@@ -1394,8 +1404,8 @@ restart:
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 
-                       xfs_alloc_compute_aligned(args, fbno, flen,
-                                                 &rbno, &rlen);
+                       busy = xfs_alloc_compute_aligned(args, fbno, flen,
+                                       &rbno, &rlen, &busy_gen);
 
                        if (rlen >= args->maxlen)
                                break;
@@ -1407,18 +1417,13 @@ restart:
                                /*
                                 * Our only valid extents must have been busy.
                                 * Make it unbusy by forcing the log out and
-                                * retrying. If we've been here before, forcing
-                                * the log isn't making the extents available,
-                                * which means they have probably been freed in
-                                * this transaction.  In that case, we have to
-                                * give up on them and we'll attempt a minlen
-                                * allocation the next time around.
+                                * retrying.
                                 */
                                xfs_btree_del_cursor(cnt_cur,
                                                     XFS_BTREE_NOERROR);
                                trace_xfs_alloc_size_busy(args);
-                               if (!forced++)
-                                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                               xfs_extent_busy_flush(args->mp,
+                                                       args->pag, busy_gen);
                                goto restart;
                        }
                }
@@ -1454,8 +1459,8 @@ restart:
                        XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
                        if (flen < bestrlen)
                                break;
-                       xfs_alloc_compute_aligned(args, fbno, flen,
-                                                 &rbno, &rlen);
+                       busy = xfs_alloc_compute_aligned(args, fbno, flen,
+                                       &rbno, &rlen, &busy_gen);
                        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
                        XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
                                (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1484,10 +1489,10 @@ restart:
         */
        args->len = rlen;
        if (rlen < args->minlen) {
-               if (!forced++) {
+               if (busy) {
                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
                        trace_xfs_alloc_size_busy(args);
-                       xfs_log_force(args->mp, XFS_LOG_SYNC);
+                       xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
                        goto restart;
                }
                goto out_nominleft;
index 29c2f997aedf49feedc5a6c447a92904a9b215c1..4d850e27095e835bea789baa043e16dbd3b4c621 100644 (file)
@@ -334,25 +334,31 @@ restart:
  * subset of the extent that is not busy.  If *rlen is smaller than
  * args->minlen no suitable extent could be found, and the higher level
  * code needs to force out the log and retry the allocation.
+ *
+ * Return the current busy generation for the AG if the extent is busy. This
+ * value can be used to wait for at least one of the currently busy extents
+ * to be cleared. Note that the busy list is not guaranteed to be empty after
+ * the gen is woken. The state of a specific extent must always be confirmed
+ * with another call to xfs_extent_busy_trim() before it can be used.
  */
-void
+bool
 xfs_extent_busy_trim(
        struct xfs_alloc_arg    *args,
-       xfs_agblock_t           bno,
-       xfs_extlen_t            len,
-       xfs_agblock_t           *rbno,
-       xfs_extlen_t            *rlen)
+       xfs_agblock_t           *bno,
+       xfs_extlen_t            *len,
+       unsigned                *busy_gen)
 {
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        struct rb_node          *rbp;
+       bool                    ret = false;
 
        ASSERT(len > 0);
 
        spin_lock(&args->pag->pagb_lock);
 restart:
-       fbno = bno;
-       flen = len;
+       fbno = *bno;
+       flen = *len;
        rbp = args->pag->pagb_tree.rb_node;
        while (rbp && flen >= args->minlen) {
                struct xfs_extent_busy *busyp =
@@ -504,24 +510,25 @@ restart:
 
                flen = fend - fbno;
        }
-       spin_unlock(&args->pag->pagb_lock);
+out:
 
-       if (fbno != bno || flen != len) {
-               trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+       if (fbno != *bno || flen != *len) {
+               trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
                                          fbno, flen);
+               *bno = fbno;
+               *len = flen;
+               *busy_gen = args->pag->pagb_gen;
+               ret = true;
        }
-       *rbno = fbno;
-       *rlen = flen;
-       return;
+       spin_unlock(&args->pag->pagb_lock);
+       return ret;
 fail:
        /*
         * Return a zero extent length as failure indications.  All callers
         * re-check if the trimmed extent satisfies the minlen requirement.
         */
-       spin_unlock(&args->pag->pagb_lock);
-       trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
-       *rbno = fbno;
-       *rlen = 0;
+       flen = 0;
+       goto out;
 }
 
 STATIC void
@@ -540,6 +547,21 @@ xfs_extent_busy_clear_one(
        kmem_free(busyp);
 }
 
+static void
+xfs_extent_busy_put_pag(
+       struct xfs_perag        *pag,
+       bool                    wakeup)
+               __releases(pag->pagb_lock)
+{
+       if (wakeup) {
+               pag->pagb_gen++;
+               wake_up_all(&pag->pagb_wait);
+       }
+
+       spin_unlock(&pag->pagb_lock);
+       xfs_perag_put(pag);
+}
+
 /*
  * Remove all extents on the passed in list from the busy extents tree.
  * If do_discard is set skip extents that need to be discarded, and mark
@@ -554,27 +576,76 @@ xfs_extent_busy_clear(
        struct xfs_extent_busy  *busyp, *n;
        struct xfs_perag        *pag = NULL;
        xfs_agnumber_t          agno = NULLAGNUMBER;
+       bool                    wakeup = false;
 
        list_for_each_entry_safe(busyp, n, list, list) {
                if (busyp->agno != agno) {
-                       if (pag) {
-                               spin_unlock(&pag->pagb_lock);
-                               xfs_perag_put(pag);
-                       }
-                       pag = xfs_perag_get(mp, busyp->agno);
-                       spin_lock(&pag->pagb_lock);
+                       if (pag)
+                               xfs_extent_busy_put_pag(pag, wakeup);
                        agno = busyp->agno;
+                       pag = xfs_perag_get(mp, agno);
+                       spin_lock(&pag->pagb_lock);
+                       wakeup = false;
                }
 
                if (do_discard && busyp->length &&
-                   !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+                   !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
                        busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
-               else
+               } else {
                        xfs_extent_busy_clear_one(mp, pag, busyp);
+                       wakeup = true;
+               }
        }
 
-       if (pag) {
-               spin_unlock(&pag->pagb_lock);
+       if (pag)
+               xfs_extent_busy_put_pag(pag, wakeup);
+}
+
+/*
+ * Flush out all busy extents for this AG.
+ */
+void
+xfs_extent_busy_flush(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       unsigned                busy_gen)
+{
+       DEFINE_WAIT             (wait);
+       int                     log_flushed = 0, error;
+
+       trace_xfs_log_force(mp, 0, _THIS_IP_);
+       error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+       if (error)
+               return;
+
+       do {
+               prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+               if  (busy_gen != READ_ONCE(pag->pagb_gen))
+                       break;
+               schedule();
+       } while (1);
+
+       finish_wait(&pag->pagb_wait, &wait);
+}
+
+void
+xfs_extent_busy_wait_all(
+       struct xfs_mount        *mp)
+{
+       DEFINE_WAIT             (wait);
+       xfs_agnumber_t          agno;
+
+       for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+               struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+               do {
+                       prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+                       if  (RB_EMPTY_ROOT(&pag->pagb_tree))
+                               break;
+                       schedule();
+               } while (1);
+               finish_wait(&pag->pagb_wait, &wait);
+
                xfs_perag_put(pag);
        }
 }
index bfff284d2dcce434a16c72df62713098a812dfd4..60195ea1b84ab0f99cfda8158bdac399a3ac5201 100644 (file)
@@ -58,9 +58,16 @@ void
 xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
        xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
 
+bool
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
+               xfs_extlen_t *len, unsigned *busy_gen);
+
+void
+xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
+       unsigned busy_gen);
+
 void
-xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
-       xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+xfs_extent_busy_wait_all(struct xfs_mount *mp);
 
 int
 xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
index 1f1e4ae441504bd60af93244eebd485a433330ce..566eb79666ad9b6b57bc789ca2d8b3778a732e68 100644 (file)
@@ -45,6 +45,7 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_extent_busy.h"
 
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -213,6 +214,7 @@ xfs_initialize_perag(
                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
                if (xfs_buf_hash_init(pag))
                        goto out_free_pag;
+               init_waitqueue_head(&pag->pagb_wait);
 
                if (radix_tree_preload(GFP_NOFS))
                        goto out_hash_destroy;
@@ -1078,6 +1080,12 @@ xfs_unmountfs(
         */
        xfs_log_force(mp, XFS_LOG_SYNC);
 
+       /*
+        * Wait for all busy extents to be freed, including completion of
+        * any discard operation.
+        */
+       xfs_extent_busy_wait_all(mp);
+
        /*
         * We now need to tell the world we are unmounting. This will allow
         * us to detect that the filesystem is going away and we should error
index 7f351f706b7a2938292680801c9fa0bf51458c1c..20e2981a4aec6bbd157a1a7f8b440d4f55c7f3d4 100644 (file)
@@ -384,6 +384,8 @@ typedef struct xfs_perag {
        xfs_agino_t     pagl_rightrec;
        spinlock_t      pagb_lock;      /* lock for pagb_tree */
        struct rb_root  pagb_tree;      /* ordered tree of busy extents */
+       unsigned int    pagb_gen;       /* generation count for pagb_tree */
+       wait_queue_head_t pagb_wait;    /* woken when pagb_gen changes */
 
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */