xfs: preallocate blocks for worst-case btree expansion
author Darrick J. Wong <darrick.wong@oracle.com>
Mon, 3 Oct 2016 16:11:44 +0000 (09:11 -0700)
committer Darrick J. Wong <darrick.wong@oracle.com>
Wed, 5 Oct 2016 23:26:27 +0000 (16:26 -0700)
To gracefully handle the situation where a CoW operation turns a
single refcount extent into a lot of tiny ones and then runs out of
space when a tree split has to happen, use the per-AG reserved block
pool to pre-allocate all the space we'll ever need for a maximal
btree.  For a 4K block size, this only costs an overhead of 0.3% of
available disk space.

When reflink is enabled, we have an unfortunate problem with rmap --
since we can share a block billions of times, this means that the
reverse mapping btree can expand basically infinitely.  When an AG is
so full that there are no free blocks with which to expand the rmapbt,
the filesystem will shut down hard.

This is rather annoying to the user, so use the AG reservation code to
reserve a "reasonable" amount of space for rmap.  We'll prevent
reflinks and CoW operations if we think we're getting close to
exhausting an AG's free space rather than shutting down, but this
permanent reservation should be enough for "most" users.  Hopefully.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
[hch@lst.de: ensure that we invalidate the freed btree buffer]
Signed-off-by: Christoph Hellwig <hch@lst.de>
fs/xfs/libxfs/xfs_ag_resv.c
fs/xfs/libxfs/xfs_refcount_btree.c
fs/xfs/libxfs/xfs_refcount_btree.h
fs/xfs/libxfs/xfs_rmap_btree.c
fs/xfs/libxfs/xfs_rmap_btree.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_fsops.h
fs/xfs/xfs_mount.c
fs/xfs/xfs_super.c

index e3ae0f2b4294cae7cb8adec79886b18ba6fe34dd..adf770f0d011836ba971d6ac9d8fc250b56d2ae5 100644 (file)
@@ -38,6 +38,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
 
 /*
  * Per-AG Block Reservations
@@ -228,6 +229,11 @@ xfs_ag_resv_init(
        if (pag->pag_meta_resv.ar_asked == 0) {
                ask = used = 0;
 
+               error = xfs_refcountbt_calc_reserves(pag->pag_mount,
+                               pag->pag_agno, &ask, &used);
+               if (error)
+                       goto out;
+
                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
                                ask, used);
                if (error)
@@ -238,6 +244,11 @@ xfs_ag_resv_init(
        if (pag->pag_agfl_resv.ar_asked == 0) {
                ask = used = 0;
 
+               error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
+                               &ask, &used);
+               if (error)
+                       goto out;
+
                error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
                if (error)
                        goto out;
index 81d58b09af718eccb6851f7ec923af1756ca4d69..453bb2757ec23f334f351693fcfcdb5b5837ce35 100644 (file)
@@ -79,6 +79,8 @@ xfs_refcountbt_alloc_block(
        struct xfs_alloc_arg    args;           /* block allocation args */
        int                     error;          /* error return value */
 
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
        memset(&args, 0, sizeof(args));
        args.tp = cur->bc_tp;
        args.mp = cur->bc_mp;
@@ -88,6 +90,7 @@ xfs_refcountbt_alloc_block(
        args.firstblock = args.fsbno;
        xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
        args.minlen = args.maxlen = args.prod = 1;
+       args.resv = XFS_AG_RESV_METADATA;
 
        error = xfs_alloc_vextent(&args);
        if (error)
@@ -125,16 +128,19 @@ xfs_refcountbt_free_block(
        struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
        xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
        struct xfs_owner_info   oinfo;
+       int                     error;
 
        trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
                        XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
        be32_add_cpu(&agf->agf_refcount_blocks, -1);
        xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
-       xfs_bmap_add_free(mp, cur->bc_private.a.dfops, fsbno, 1,
-                       &oinfo);
+       error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
+                       XFS_AG_RESV_METADATA);
+       if (error)
+               return error;
 
-       return 0;
+       return error;
 }
 
 STATIC int
@@ -387,3 +393,59 @@ xfs_refcountbt_compute_maxlevels(
        mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp,
                        mp->m_refc_mnr, mp->m_sb.sb_agblocks);
 }
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_refcountbt_calc_size(
+       struct xfs_mount        *mp,
+       unsigned long long      len)
+{
+       return xfs_btree_calc_size(mp, mp->m_refc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_refcountbt_max_size(
+       struct xfs_mount        *mp)
+{
+       /* Bail out if we're uninitialized, which can happen in mkfs. */
+       if (mp->m_refc_mxr[0] == 0)
+               return 0;
+
+       return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_refcountbt_calc_reserves(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno,
+       xfs_extlen_t            *ask,
+       xfs_extlen_t            *used)
+{
+       struct xfs_buf          *agbp;
+       struct xfs_agf          *agf;
+       xfs_extlen_t            tree_len;
+       int                     error;
+
+       if (!xfs_sb_version_hasreflink(&mp->m_sb))
+               return 0;
+
+       *ask += xfs_refcountbt_max_size(mp);
+
+       error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+       if (error)
+               return error;
+
+       agf = XFS_BUF_TO_AGF(agbp);
+       tree_len = be32_to_cpu(agf->agf_refcount_blocks);
+       xfs_buf_relse(agbp);
+
+       *used += tree_len;
+
+       return error;
+}
index 9e9ad7c6d267eb2f1fc7fb5ed296dbe3d7211d38..3be7768bd51a1c0ebd8c2ccc6e930a730bd39b54 100644 (file)
@@ -64,4 +64,11 @@ extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen,
                bool leaf);
 extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
 
+extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
+               unsigned long long len);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
+               xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif /* __XFS_REFCOUNT_BTREE_H__ */
index 9c0585e7db1f4c46790d1f4fe70d68358e7aa398..83e672ff7577e9040d22668a308097922b998cab 100644 (file)
@@ -35,6 +35,7 @@
 #include "xfs_cksum.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
 
 /*
  * Reverse map btree.
@@ -533,3 +534,62 @@ xfs_rmapbt_compute_maxlevels(
                mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
                                mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
 }
+
+/* Calculate the rmap btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+       struct xfs_mount        *mp,
+       unsigned long long      len)
+{
+       return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum rmap btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+       struct xfs_mount        *mp)
+{
+       /* Bail out if we're uninitialized, which can happen in mkfs. */
+       if (mp->m_rmap_mxr[0] == 0)
+               return 0;
+
+       return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_rmapbt_calc_reserves(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno,
+       xfs_extlen_t            *ask,
+       xfs_extlen_t            *used)
+{
+       struct xfs_buf          *agbp;
+       struct xfs_agf          *agf;
+       xfs_extlen_t            pool_len;
+       xfs_extlen_t            tree_len;
+       int                     error;
+
+       if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+               return 0;
+
+       /* Reserve the larger of 1% of the AG or the maximal rmapbt size. */
+       pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));
+       *ask += pool_len;
+
+       error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+       if (error)
+               return error;
+
+       agf = XFS_BUF_TO_AGF(agbp);
+       tree_len = be32_to_cpu(agf->agf_rmap_blocks);
+       xfs_buf_relse(agbp);
+
+       *used += tree_len;
+
+       return error;
+}
index e73a55357dabe1f2a3f3c54bda6e5f49f52a4188..2a9ac472fb15a2408ca6d58addc6a5d439b61fe9 100644 (file)
@@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
 int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
 extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
 
+extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
+               unsigned long long len);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
+               xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif /* __XFS_RMAP_BTREE_H__ */
index 3acbf4e03187a34e71b42d299a8f4eac1ab18dc5..93d12fa2670d53bf8b6bf23c51325d48467b8ab2 100644 (file)
@@ -43,6 +43,7 @@
 #include "xfs_log.h"
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
 
 /*
  * File system operations
@@ -630,6 +631,11 @@ xfs_growfs_data_private(
        xfs_set_low_space_thresholds(mp);
        mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
+       /* Reserve AG metadata blocks. */
+       error = xfs_fs_reserve_ag_blocks(mp);
+       if (error && error != -ENOSPC)
+               goto out;
+
        /* update secondary superblocks. */
        for (agno = 1; agno < nagcount; agno++) {
                error = 0;
@@ -680,6 +686,8 @@ xfs_growfs_data_private(
                        continue;
                }
        }
+
+ out:
        return saved_error ? saved_error : error;
 
  error0:
@@ -989,3 +997,59 @@ xfs_do_force_shutdown(
        "Please umount the filesystem and rectify the problem(s)");
        }
 }
+
+/*
+ * Reserve free space for per-AG metadata.
+ */
+int
+xfs_fs_reserve_ag_blocks(
+       struct xfs_mount        *mp)
+{
+       xfs_agnumber_t          agno;
+       struct xfs_perag        *pag;
+       int                     error = 0;
+       int                     err2;
+
+       for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+               pag = xfs_perag_get(mp, agno);
+               err2 = xfs_ag_resv_init(pag);
+               xfs_perag_put(pag);
+               if (err2 && !error)
+                       error = err2;
+       }
+
+       if (error && error != -ENOSPC) {
+               xfs_warn(mp,
+       "Error %d reserving per-AG metadata reserve pool.", error);
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+       }
+
+       return error;
+}
+
+/*
+ * Free space reserved for per-AG metadata.
+ */
+int
+xfs_fs_unreserve_ag_blocks(
+       struct xfs_mount        *mp)
+{
+       xfs_agnumber_t          agno;
+       struct xfs_perag        *pag;
+       int                     error = 0;
+       int                     err2;
+
+       for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+               pag = xfs_perag_get(mp, agno);
+               err2 = xfs_ag_resv_free(pag);
+               xfs_perag_put(pag);
+               if (err2 && !error)
+                       error = err2;
+       }
+
+       if (error)
+               xfs_warn(mp,
+       "Error %d freeing per-AG metadata reserve pool.", error);
+
+       return error;
+}
index f32713f14f9a21c1b752e2e8eb889dea72411f8e..f34915898fea25376099af70fd417442af171815 100644 (file)
@@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
                                xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
 
+extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+
 #endif /* __XFS_FSOPS_H__ */
index 099c00ecb078abadf0092794557968364b393093..40fedc00b30da948dae09a3e2735609f90ef21ca 100644 (file)
@@ -995,10 +995,17 @@ xfs_mountfs(
                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                        goto out_quota;
                }
+
+               /* Reserve AG blocks for future btree expansion. */
+               error = xfs_fs_reserve_ag_blocks(mp);
+               if (error && error != -ENOSPC)
+                       goto out_agresv;
        }
 
        return 0;
 
+ out_agresv:
+       xfs_fs_unreserve_ag_blocks(mp);
  out_quota:
        xfs_qm_unmount_quotas(mp);
  out_rtunmount:
@@ -1043,6 +1050,7 @@ xfs_unmountfs(
 
        cancel_delayed_work_sync(&mp->m_eofblocks_work);
 
+       xfs_fs_unreserve_ag_blocks(mp);
        xfs_qm_unmount_quotas(mp);
        xfs_rtunmount_inodes(mp);
        IRELE(mp->m_rootip);
index c32e7e61e7f390f26279fac430f3de2aa39f8806..90a8fd724abb5ed506fe610105fe459774fd8181 100644 (file)
@@ -1325,10 +1325,22 @@ xfs_fs_remount(
                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                        return error;
                }
+
+               /* Create the per-AG metadata reservation pool. */
+               error = xfs_fs_reserve_ag_blocks(mp);
+               if (error && error != -ENOSPC)
+                       return error;
        }
 
        /* rw -> ro */
        if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+               /* Free the per-AG metadata reservation pool. */
+               error = xfs_fs_unreserve_ag_blocks(mp);
+               if (error) {
+                       xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+                       return error;
+               }
+
                /*
                 * Before we sync the metadata, we need to free up the reserve
                 * block pool so that the used block count in the superblock on