kill-the-bkl/reiserfs: unlock only when needed in search_by_key
authorFrederic Weisbecker <fweisbec@gmail.com>
Thu, 14 May 2009 00:56:39 +0000 (02:56 +0200)
committerFrederic Weisbecker <fweisbec@gmail.com>
Mon, 14 Sep 2009 05:18:22 +0000 (07:18 +0200)
search_by_key() is the site which most requires the lock.
This is mostly because it is a very central function and also
because it releases/reacquires the write lock at least once each
time it is called.

Such release/reacquire creates a lot of contention in this place and
also widens the window that lets another thread change the tree.
When that happens, the current path search over the tree must be
retried from the beginning (the root), which is a wasteful and
time-consuming recovery.

This patch factorizes two release/reacquire sequences:

- reading leaf node blocks
- reading current block

The latter immediately follows the former.

The whole sequence is safe as a single unlocked section because
we check just after if the tree has changed during these operations.

Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
fs/reiserfs/stree.c

index 960c9114f6d348b959af129e343d251b82e929c7..6b025a42d510be137280eb28117865c5b8354369 100644 (file)
@@ -519,12 +519,22 @@ static int is_tree_node(struct buffer_head *bh, int level)
 
 #define SEARCH_BY_KEY_READA 16
 
-/* The function is NOT SCHEDULE-SAFE! */
-static void search_by_key_reada(struct super_block *s,
+/*
+ * The function is NOT SCHEDULE-SAFE!
+ * It might unlock the write lock if we needed to wait for a block
+ * to be read. Note that in this case it won't recover the lock to avoid
+ * high contention resulting from too much lock requests, especially
+ * the caller (search_by_key) will perform other schedule-unsafe
+ * operations just after calling this function.
+ *
+ * @return true if we have unlocked
+ */
+static bool search_by_key_reada(struct super_block *s,
                                struct buffer_head **bh,
                                b_blocknr_t *b, int num)
 {
        int i, j;
+       bool unlocked = false;
 
        for (i = 0; i < num; i++) {
                bh[i] = sb_getblk(s, b[i]);
@@ -536,16 +546,21 @@ static void search_by_key_reada(struct super_block *s,
         * the lock. But it's still fine because we check later
         * if the tree changed
         */
-       reiserfs_write_unlock(s);
        for (j = 0; j < i; j++) {
                /*
                 * note, this needs attention if we are getting rid of the BKL
                 * you have to make sure the prepared bit isn't set on this buffer
                 */
-               if (!buffer_uptodate(bh[j]))
+               if (!buffer_uptodate(bh[j])) {
+                       if (!unlocked) {
+                               reiserfs_write_unlock(s);
+                               unlocked = true;
+                       }
                        ll_rw_block(READA, 1, bh + j);
+               }
                brelse(bh[j]);
        }
+       return unlocked;
 }
 
 /**************************************************************************
@@ -633,15 +648,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key,      /* Key to s
                   have a pointer to it. */
                if ((bh = last_element->pe_buffer =
                     sb_getblk(sb, block_number))) {
+                       bool unlocked = false;
+
                        if (!buffer_uptodate(bh) && reada_count > 1)
-                               /* will unlock the write lock */
-                               search_by_key_reada(sb, reada_bh,
+                               /* may unlock the write lock */
+                               unlocked = search_by_key_reada(sb, reada_bh,
                                                    reada_blocks, reada_count);
-                       else
+                       /*
+                        * If we haven't already unlocked the write lock,
+                        * then we need to do that here before reading
+                        * the current block
+                        */
+                       if (!buffer_uptodate(bh) && !unlocked) {
                                reiserfs_write_unlock(sb);
+                               unlocked = true;
+                       }
                        ll_rw_block(READ, 1, &bh);
                        wait_on_buffer(bh);
-                       reiserfs_write_lock(sb);
+
+                       if (unlocked)
+                               reiserfs_write_lock(sb);
                        if (!buffer_uptodate(bh))
                                goto io_error;
                } else {