xfs: use rhashtable to track buffer cache
authorLucas Stach <dev@lynxeye.de>
Wed, 7 Dec 2016 06:36:36 +0000 (17:36 +1100)
committerDave Chinner <david@fromorbit.com>
Wed, 7 Dec 2016 06:36:36 +0000 (17:36 +1100)
On filesystems with a lot of metadata and in metadata intensive workloads
xfs_buf_find() is showing up at the top of the CPU cycles trace. Most of
the CPU time is spent on CPU cache misses while traversing the rbtree.

As the buffer cache does not need any kind of ordering, but fast lookups
a hashtable is the natural data structure to use. The rhashtable
infrastructure provides a self-scaling hashtable implementation and
allows lookups to proceed while the table is going through a resize
operation.

This reduces the CPU-time spent for the lookups to 1/3 even for small
filesystems with a relatively small number of cached buffers, with
possibly much larger gains on higher loaded filesystems.

[dchinner: reduce minimum hash size to an acceptable size for large
   filesystems with many AGs with no active use.]
[dchinner: remove stale rbtree asserts.]
[dchinner: use xfs_buf_map for compare function argument.]
[dchinner: make functions static.]
[dchinner: remove redundant comments.]

Signed-off-by: Lucas Stach <dev@lynxeye.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_linux.h
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h

index b5b9bffe352074806910a064ae41a941c97678c8..516109424c9604634ae3ecbecc396122403cd86f 100644 (file)
@@ -219,7 +219,6 @@ _xfs_buf_alloc(
        init_completion(&bp->b_iowait);
        INIT_LIST_HEAD(&bp->b_lru);
        INIT_LIST_HEAD(&bp->b_list);
-       RB_CLEAR_NODE(&bp->b_rbnode);
        sema_init(&bp->b_sema, 0); /* held, no waiters */
        spin_lock_init(&bp->b_lock);
        XB_SET_OWNER(bp);
@@ -473,6 +472,62 @@ _xfs_buf_map_pages(
 /*
  *     Finding and Reading Buffers
  */
+static int
+_xfs_buf_obj_cmp(
+       struct rhashtable_compare_arg   *arg,
+       const void                      *obj)
+{
+       const struct xfs_buf_map        *map = arg->key;
+       const struct xfs_buf            *bp = obj;
+
+       /*
+        * The key hashing in the lookup path depends on the key being the
+        * first element of the compare_arg, make sure to assert this.
+        */
+       BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
+
+       if (bp->b_bn != map->bm_bn)
+               return 1;
+
+       if (unlikely(bp->b_length != map->bm_len)) {
+               /*
+                * found a block number match. If the range doesn't
+                * match, the only way this is allowed is if the buffer
+                * in the cache is stale and the transaction that made
+                * it stale has not yet committed. i.e. we are
+                * reallocating a busy extent. Skip this buffer and
+                * continue searching for an exact match.
+                */
+               ASSERT(bp->b_flags & XBF_STALE);
+               return 1;
+       }
+       return 0;
+}
+
+static const struct rhashtable_params xfs_buf_hash_params = {
+       .min_size               = 32,   /* empty AGs have minimal footprint */
+       .nelem_hint             = 16,
+       .key_len                = sizeof(xfs_daddr_t),
+       .key_offset             = offsetof(struct xfs_buf, b_bn),
+       .head_offset            = offsetof(struct xfs_buf, b_rhash_head),
+       .automatic_shrinking    = true,
+       .obj_cmpfn              = _xfs_buf_obj_cmp,
+};
+
+int
+xfs_buf_hash_init(
+       struct xfs_perag        *pag)
+{
+       spin_lock_init(&pag->pag_buf_lock);
+       return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+}
+
+void
+xfs_buf_hash_destroy(
+       struct xfs_perag        *pag)
+{
+       rhashtable_destroy(&pag->pag_buf_hash);
+}
 
 /*
  *     Look up, and creates if absent, a lockable buffer for
@@ -488,27 +543,24 @@ _xfs_buf_find(
        xfs_buf_t               *new_bp)
 {
        struct xfs_perag        *pag;
-       struct rb_node          **rbp;
-       struct rb_node          *parent;
        xfs_buf_t               *bp;
-       xfs_daddr_t             blkno = map[0].bm_bn;
+       struct xfs_buf_map      cmap = { .bm_bn = map[0].bm_bn };
        xfs_daddr_t             eofs;
-       int                     numblks = 0;
        int                     i;
 
        for (i = 0; i < nmaps; i++)
-               numblks += map[i].bm_len;
+               cmap.bm_len += map[i].bm_len;
 
        /* Check for IOs smaller than the sector size / not sector aligned */
-       ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
-       ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+       ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
+       ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
 
        /*
         * Corrupted block numbers can get through to here, unfortunately, so we
         * have to check that the buffer falls within the filesystem bounds.
         */
        eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
-       if (blkno < 0 || blkno >= eofs) {
+       if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
                /*
                 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
                 * but none of the higher level infrastructure supports
@@ -516,53 +568,29 @@ _xfs_buf_find(
                 */
                xfs_alert(btp->bt_mount,
                          "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
-                         __func__, blkno, eofs);
+                         __func__, cmap.bm_bn, eofs);
                WARN_ON(1);
                return NULL;
        }
 
-       /* get tree root */
        pag = xfs_perag_get(btp->bt_mount,
-                               xfs_daddr_to_agno(btp->bt_mount, blkno));
+                           xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
 
-       /* walk tree */
        spin_lock(&pag->pag_buf_lock);
-       rbp = &pag->pag_buf_tree.rb_node;
-       parent = NULL;
-       bp = NULL;
-       while (*rbp) {
-               parent = *rbp;
-               bp = rb_entry(parent, struct xfs_buf, b_rbnode);
-
-               if (blkno < bp->b_bn)
-                       rbp = &(*rbp)->rb_left;
-               else if (blkno > bp->b_bn)
-                       rbp = &(*rbp)->rb_right;
-               else {
-                       /*
-                        * found a block number match. If the range doesn't
-                        * match, the only way this is allowed is if the buffer
-                        * in the cache is stale and the transaction that made
-                        * it stale has not yet committed. i.e. we are
-                        * reallocating a busy extent. Skip this buffer and
-                        * continue searching to the right for an exact match.
-                        */
-                       if (bp->b_length != numblks) {
-                               ASSERT(bp->b_flags & XBF_STALE);
-                               rbp = &(*rbp)->rb_right;
-                               continue;
-                       }
-                       atomic_inc(&bp->b_hold);
-                       goto found;
-               }
+       bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
+                                   xfs_buf_hash_params);
+       if (bp) {
+               atomic_inc(&bp->b_hold);
+               goto found;
        }
 
        /* No match found */
        if (new_bp) {
-               rb_link_node(&new_bp->b_rbnode, parent, rbp);
-               rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
                /* the buffer keeps the perag reference until it is freed */
                new_bp->b_pag = pag;
+               rhashtable_insert_fast(&pag->pag_buf_hash,
+                                      &new_bp->b_rhash_head,
+                                      xfs_buf_hash_params);
                spin_unlock(&pag->pag_buf_lock);
        } else {
                XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
@@ -930,7 +958,6 @@ xfs_buf_rele(
 
        if (!pag) {
                ASSERT(list_empty(&bp->b_lru));
-               ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold)) {
                        xfs_buf_ioacct_dec(bp);
                        xfs_buf_free(bp);
@@ -938,8 +965,6 @@ xfs_buf_rele(
                return;
        }
 
-       ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
-
        ASSERT(atomic_read(&bp->b_hold) > 0);
 
        release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
@@ -983,7 +1008,8 @@ xfs_buf_rele(
                }
 
                ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-               rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+               rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
+                                      xfs_buf_hash_params);
                spin_unlock(&pag->pag_buf_lock);
                xfs_perag_put(pag);
                freebuf = true;
index 38c9a95bda7d7c42763dbb3a63f565bd67f8a98f..8a9d3a9599f00f410fe9b6e922b62242dfd8ab81 100644 (file)
@@ -151,7 +151,7 @@ typedef struct xfs_buf {
         * which is the only bit that is touched if we hit the semaphore
         * fast-path on locking.
         */
-       struct rb_node          b_rbnode;       /* rbtree node */
+       struct rhash_head       b_rhash_head;   /* pag buffer hash node */
        xfs_daddr_t             b_bn;           /* block number of buffer */
        int                     b_length;       /* size of buffer in BBs */
        atomic_t                b_hold;         /* reference count */
index 68640fb63a542346cac31553dd0aad1a78421a16..a415f822f2c1c15ce751fb44b9ab3c7bef1840e6 100644 (file)
@@ -78,6 +78,7 @@ typedef __u32                 xfs_nlink_t;
 #include <linux/freezer.h>
 #include <linux/list_sort.h>
 #include <linux/ratelimit.h>
+#include <linux/rhashtable.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
index b341f10cf4810bf3716aec354f33fbdbf9ab5494..9b9540db17a6132472dc35c9ec2edaa5ace0b30d 100644 (file)
@@ -157,6 +157,7 @@ xfs_free_perag(
                spin_unlock(&mp->m_perag_lock);
                ASSERT(pag);
                ASSERT(atomic_read(&pag->pag_ref) == 0);
+               xfs_buf_hash_destroy(pag);
                call_rcu(&pag->rcu_head, __xfs_free_perag);
        }
 }
@@ -212,8 +213,8 @@ xfs_initialize_perag(
                spin_lock_init(&pag->pag_ici_lock);
                mutex_init(&pag->pag_ici_reclaim_lock);
                INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-               spin_lock_init(&pag->pag_buf_lock);
-               pag->pag_buf_tree = RB_ROOT;
+               if (xfs_buf_hash_init(pag))
+                       goto out_unwind;
 
                if (radix_tree_preload(GFP_NOFS))
                        goto out_unwind;
@@ -239,9 +240,11 @@ xfs_initialize_perag(
        return 0;
 
 out_unwind:
+       xfs_buf_hash_destroy(pag);
        kmem_free(pag);
        for (; index > first_initialised; index--) {
                pag = radix_tree_delete(&mp->m_perag_tree, index);
+               xfs_buf_hash_destroy(pag);
                kmem_free(pag);
        }
        return error;
index 819b80b15bfb11146962539a366ecf2efa093f96..84f785218907434276d12e6926d8e85974a14477 100644 (file)
@@ -393,8 +393,8 @@ typedef struct xfs_perag {
        unsigned long   pag_ici_reclaim_cursor; /* reclaim restart point */
 
        /* buffer cache index */
-       spinlock_t      pag_buf_lock;   /* lock for pag_buf_tree */
-       struct rb_root  pag_buf_tree;   /* ordered tree of active buffers */
+       spinlock_t      pag_buf_lock;   /* lock for pag_buf_hash */
+       struct rhashtable pag_buf_hash;
 
        /* for rcu-safe freeing */
        struct rcu_head rcu_head;
@@ -424,6 +424,9 @@ xfs_perag_resv(
        }
 }
 
+int xfs_buf_hash_init(xfs_perag_t *pag);
+void xfs_buf_hash_destroy(xfs_perag_t *pag);
+
 extern void    xfs_uuid_table_free(void);
 extern int     xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);