xfs: allocate log vector buffers outside CIL context lock

author Dave Chinner <dchinner@redhat.com>

Thu, 21 Jul 2016 23:52:35 +0000 (09:52 +1000)

committer Dave Chinner <david@fromorbit.com>

Thu, 21 Jul 2016 23:52:35 +0000 (09:52 +1000)
author Dave Chinner <dchinner@redhat.com>
Thu, 21 Jul 2016 23:52:35 +0000 (09:52 +1000)
committer Dave Chinner <david@fromorbit.com>
Thu, 21 Jul 2016 23:52:35 +0000 (09:52 +1000)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c

index 34257992934c4184feba8505c8342e2e0a1e9edc..4561b1e2f2f9c1d34cca44b3725649a1ef0000f7 100644 (file)
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -949,6 +949,7 @@ xfs_buf_item_free(
         xfs_buf_log_item_t      *bip)
  {
         xfs_buf_item_free_format(bip);
+       kmem_free(bip->bli_item.li_lv_shadow);
         kmem_zone_free(xfs_buf_item_zone, bip);
  }
  
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c

index e0646659ce16eaafa7b35833fac127fd13b80582..ccb0811963b27d2b112b23df77337ab9b2c54c2c 100644 (file)
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
  {
         ASSERT(list_empty(&dqp->q_lru));
  
+       kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
         mutex_destroy(&dqp->q_qlock);
  
         XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c

index 814cff94e78f6752bb9bf70d61697320c409f9e7..2c7a1629e064b4fd1f647affc3432797d058ae26 100644 (file)
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
         spin_lock(&ailp->xa_lock);
         xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
  
+       kmem_free(qfs->qql_item.li_lv_shadow);
+       kmem_free(lip->li_lv_shadow);
         kmem_free(qfs);
         kmem_free(qfe);
         return (xfs_lsn_t)-1;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c

index 4aa0153214f91fb9ec03621c0b006317ca3d0e09..ab779460ecbf3ab8d4c89fecdde1b4a48e66b746 100644 (file)
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,6 +40,7 @@ void
  xfs_efi_item_free(
         struct xfs_efi_log_item *efip)
  {
+       kmem_free(efip->efi_item.li_lv_shadow);
         if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
                 kmem_free(efip);
         else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
  STATIC void
  xfs_efd_item_free(struct xfs_efd_log_item *efdp)
  {
+       kmem_free(efdp->efd_item.li_lv_shadow);
         if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
                 kmem_free(efdp);
         else
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c

index a1b07612224c95ea56391474a4bb790fec07fab6..892c2aced20788cb9875dc3a9cddee78dc0b1b8b 100644 (file)
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,6 +651,7 @@ void
  xfs_inode_item_destroy(
         xfs_inode_t     *ip)
  {
+       kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
         kmem_zone_free(xfs_ili_zone, ip->i_itemp);
  }
  
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c

index 5e54e7955ea638a7c8fbbd885080fbf0005282f7..a4ab192e1792d1609e4cc309eceeefb5a71ff32f 100644 (file)
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
         log->l_cilp->xc_ctx->sequence = 1;
  }
  
+static inline int
+xlog_cil_iovec_space(
+       uint    niovecs)
+{
+       return round_up((sizeof(struct xfs_log_vec) +
+                                       niovecs * sizeof(struct xfs_log_iovec)),
+                       sizeof(uint64_t));
+}
+
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ *     a) does not exist; or
+ *     b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * it is needed, and this is the buffer that is reallocated to match the size of
+ * the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * it's size is right, otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+       struct xlog             *log,
+       struct xfs_trans        *tp)
+{
+       struct xfs_log_item_desc *lidp;
+
+       list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+               struct xfs_log_item *lip = lidp->lid_item;
+               struct xfs_log_vec *lv;
+               int     niovecs = 0;
+               int     nbytes = 0;
+               int     buf_size;
+               bool    ordered = false;
+
+               /* Skip items which aren't dirty in this transaction. */
+               if (!(lidp->lid_flags & XFS_LID_DIRTY))
+                       continue;
+
+               /* get number of vecs and size of data to be stored */
+               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+               /*
+                * Ordered items need to be tracked but we do not wish to write
+                * them. We need a logvec to track the object, but we do not
+                * need an iovec or buffer to be allocated for copying data.
+                */
+               if (niovecs == XFS_LOG_VEC_ORDERED) {
+                       ordered = true;
+                       niovecs = 0;
+                       nbytes = 0;
+               }
+
+               /*
+                * We 64-bit align the length of each iovec so that the start
+                * of the next one is naturally aligned.  We'll need to
+                * account for that slack space here. Then round nbytes up
+                * to 64-bit alignment so that the initial buffer alignment is
+                * easy to calculate and verify.
+                */
+               nbytes += niovecs * sizeof(uint64_t);
+               nbytes = round_up(nbytes, sizeof(uint64_t));
+
+               /*
+                * The data buffer needs to start 64-bit aligned, so round up
+                * that space to ensure we can align it appropriately and not
+                * overrun the buffer.
+                */
+               buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+               /*
+                * if we have no shadow buffer, or it is too small, we need to
+                * reallocate it.
+                */
+               if (!lip->li_lv_shadow ||
+                   buf_size > lip->li_lv_shadow->lv_size) {
+
+                       /*
+                        * We free and allocate here as a realloc would copy
+                        * unecessary data. We don't use kmem_zalloc() for the
+                        * same reason - we don't need to zero the data area in
+                        * the buffer, only the log vector header and the iovec
+                        * storage.
+                        */
+                       kmem_free(lip->li_lv_shadow);
+
+                       lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+                       memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+                       lv->lv_item = lip;
+                       lv->lv_size = buf_size;
+                       if (ordered)
+                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       else
+                               lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+                       lip->li_lv_shadow = lv;
+               } else {
+                       /* same or smaller, optimise common overwrite case */
+                       lv = lip->li_lv_shadow;
+                       if (ordered)
+                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+                       else
+                               lv->lv_buf_len = 0;
+                       lv->lv_bytes = 0;
+                       lv->lv_next = NULL;
+               }
+
+               /* Ensure the lv is set up according to ->iop_size */
+               lv->lv_niovecs = niovecs;
+
+               /* The allocated data region lies beyond the iovec region */
+               lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+       }
+
+}
+
  /*
   * Prepare the log item for insertion into the CIL. Calculate the difference in
   * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
         /*
          * If there is no old LV, this is the first time we've seen the item in
          * this CIL context and so we need to pin it. If we are replacing the
-        * old_lv, then remove the space it accounts for and free it.
+        * old_lv, then remove the space it accounts for and make it the shadow
+        * buffer for later freeing. In both cases we are now switching to the
+        * shadow buffer, so update the the pointer to it appropriately.
          */
-       if (!old_lv)
+       if (!old_lv) {
                 lv->lv_item->li_ops->iop_pin(lv->lv_item);
-       else if (old_lv != lv) {
+               lv->lv_item->li_lv_shadow = NULL;
+       } else if (old_lv != lv) {
                 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
  
                 *diff_len -= old_lv->lv_bytes;
                 *diff_iovecs -= old_lv->lv_niovecs;
-               kmem_free(old_lv);
+               lv->lv_item->li_lv_shadow = old_lv;
         }
  
         /* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
   * write it out asynchronously without needing to relock the object that was
   * modified at the time it gets written into the iclog.
   *
- * This function builds a vector for the changes in each log item in the
- * transaction. It then works out the length of the buffer needed for each log
- * item, allocates them and formats the vector for the item into the buffer.
- * The buffer is then attached to the log item are then inserted into the
- * Committed Item List for tracking until the next checkpoint is written out.
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simple swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by th enext modification or the freeing of the log item.
   *
   * We don't set up region headers during this process; we simply copy the
   * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
         list_for_each_entry(lidp, &tp->t_items, lid_trans) {
                 struct xfs_log_item *lip = lidp->lid_item;
                 struct xfs_log_vec *lv;
-               struct xfs_log_vec *old_lv;
-               int     niovecs = 0;
-               int     nbytes = 0;
-               int     buf_size;
+               struct xfs_log_vec *old_lv = NULL;
+               struct xfs_log_vec *shadow;
                 bool    ordered = false;
  
                 /* Skip items which aren't dirty in this transaction. */
                 if (!(lidp->lid_flags & XFS_LID_DIRTY))
                         continue;
  
-               /* get number of vecs and size of data to be stored */
-               lip->li_ops->iop_size(lip, &niovecs, &nbytes);
-
-               /* Skip items that do not have any vectors for writing */
-               if (!niovecs)
-                       continue;
-
                 /*
-                * Ordered items need to be tracked but we do not wish to write
-                * them. We need a logvec to track the object, but we do not
-                * need an iovec or buffer to be allocated for copying data.
+                * The formatting size information is already attached to
+                * the shadow lv on the log item.
                  */
-               if (niovecs == XFS_LOG_VEC_ORDERED) {
+               shadow = lip->li_lv_shadow;
+               if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
                         ordered = true;
-                       niovecs = 0;
-                       nbytes = 0;
-               }
  
-               /*
-                * We 64-bit align the length of each iovec so that the start
-                * of the next one is naturally aligned.  We'll need to
-                * account for that slack space here. Then round nbytes up
-                * to 64-bit alignment so that the initial buffer alignment is
-                * easy to calculate and verify.
-                */
-               nbytes += niovecs * sizeof(uint64_t);
-               nbytes = round_up(nbytes, sizeof(uint64_t));
-
-               /* grab the old item if it exists for reservation accounting */
-               old_lv = lip->li_lv;
-
-               /*
-                * The data buffer needs to start 64-bit aligned, so round up
-                * that space to ensure we can align it appropriately and not
-                * overrun the buffer.
-                */
-               buf_size = nbytes +
-                          round_up((sizeof(struct xfs_log_vec) +
-                                    niovecs * sizeof(struct xfs_log_iovec)),
-                                   sizeof(uint64_t));
+               /* Skip items that do not have any vectors for writing */
+               if (!shadow->lv_niovecs && !ordered)
+                       continue;
  
                 /* compare to existing item size */
-               if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+               old_lv = lip->li_lv;
+               if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
                         /* same or smaller, optimise common overwrite case */
                         lv = lip->li_lv;
                         lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
                          */
                         *diff_iovecs -= lv->lv_niovecs;
                         *diff_len -= lv->lv_bytes;
+
+                       /* Ensure the lv is set up according to ->iop_size */
+                       lv->lv_niovecs = shadow->lv_niovecs;
+
+                       /* reset the lv buffer information for new formatting */
+                       lv->lv_buf_len = 0;
+                       lv->lv_bytes = 0;
+                       lv->lv_buf = (char *)lv +
+                                       xlog_cil_iovec_space(lv->lv_niovecs);
                 } else {
-                       /* allocate new data chunk */
-                       lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+                       /* switch to shadow buffer! */
+                       lv = shadow;
                         lv->lv_item = lip;
-                       lv->lv_size = buf_size;
                         if (ordered) {
                                 /* track as an ordered logvec */
                                 ASSERT(lip->li_lv == NULL);
-                               lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
                                 goto insert;
                         }
-                       lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
                 }
  
-               /* Ensure the lv is set up according to ->iop_size */
-               lv->lv_niovecs = niovecs;
-
-               /* The allocated data region lies beyond the iovec region */
-               lv->lv_buf_len = 0;
-               lv->lv_bytes = 0;
-               lv->lv_buf = (char *)lv + buf_size - nbytes;
                 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
-
                 lip->li_ops->iop_format(lip, lv);
  insert:
-               ASSERT(lv->lv_buf_len <= nbytes);
                 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
         }
  }
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
         struct xlog             *log = mp->m_log;
         struct xfs_cil          *cil = log->l_cilp;
  
+       /*
+        * Do all necessary memory allocation before we lock the CIL.
+        * This ensures the allocation does not deadlock with a CIL
+        * push in memory reclaim (e.g. from kswapd).
+        */
+       xlog_cil_alloc_shadow_bufs(log, tp);
+
         /* lock out background commit */
         down_read(&cil->xc_ctx_lock);
  
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h

index 9a462e892e4f33f9f50508b6ba8f33bfb348a420..9b2b9fa89331c10d57818218ccd8fc985a3325f6 100644 (file)
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
         /* delayed logging */
         struct list_head                li_cil;         /* CIL pointers */
         struct xfs_log_vec              *li_lv;         /* active log vector */
+       struct xfs_log_vec              *li_lv_shadow;  /* standby vector */
         xfs_lsn_t                       li_seq;         /* CIL commit seq */
  } xfs_log_item_t;
author	Dave Chinner <dchinner@redhat.com>
	Thu, 21 Jul 2016 23:52:35 +0000 (09:52 +1000)
committer	Dave Chinner <david@fromorbit.com>
	Thu, 21 Jul 2016 23:52:35 +0000 (09:52 +1000)
fs/xfs/xfs_buf_item.c		patch \| blob \| blame \| history
fs/xfs/xfs_dquot.c		patch \| blob \| blame \| history
fs/xfs/xfs_dquot_item.c		patch \| blob \| blame \| history
fs/xfs/xfs_extfree_item.c		patch \| blob \| blame \| history
fs/xfs/xfs_inode_item.c		patch \| blob \| blame \| history
fs/xfs/xfs_log_cil.c		patch \| blob \| blame \| history
fs/xfs/xfs_trans.h		patch \| blob \| blame \| history