jbd: Refine commit writeout logic
authorJan Kara <jack@suse.cz>
Mon, 21 Feb 2011 16:25:37 +0000 (17:25 +0100)
committerJan Kara <jack@suse.cz>
Wed, 11 Apr 2012 09:12:44 +0000 (11:12 +0200)
Currently we write out all journal buffers in WRITE_SYNC mode. This improves
performance for fsync heavy workloads but hinders performance when writes
are mostly asynchronous, most noticably it slows down readers and users
complain about slow desktop response etc.

So submit writes as asynchronous in the normal case and only submit writes as
WRITE_SYNC if we detect someone is waiting for current transaction commit.

I've gathered some numbers to back this change. The first is the read latency
test. It measures time to read 1 MB after several seconds of sleeping in
presence of streaming writes.

Top 10 times (out of 90) in us:
Before After
2131586 697473
1709932 557487
1564598 535642
1480462 347573
1478579 323153
1408496 222181
1388960 181273
1329565 181070
1252486 172832
1223265 172278

Average:
619377 82180

So the improvement in both maximum and average latency is massive.

I've measured fsync throughput by:
fs_mark -n 100 -t 1 -s 16384 -d /mnt/fsync/ -S 1 -L 4

in presence of streaming reader. The numbers (fsyncs/s) are:
Before After
9.9 6.3
6.8 6.0
6.3 6.2
5.8 6.1

So fsync performance seems unharmed by this change.

Signed-off-by: Jan Kara <jack@suse.cz>
fs/jbd/commit.c
fs/jbd/journal.c
fs/jbd/transaction.c
include/linux/jbd.h
include/trace/events/jbd.h

index f2b9a571f4cf51f8a61995c43b131851c504f567..9d31e6a39205fcfd13b9d921c564fdc4f41533b6 100644 (file)
@@ -298,6 +298,7 @@ void journal_commit_transaction(journal_t *journal)
        int tag_flag;
        int i;
        struct blk_plug plug;
+       int write_op = WRITE;
 
        /*
         * First job: lock down the current transaction and wait for
@@ -413,13 +414,16 @@ void journal_commit_transaction(journal_t *journal)
 
        jbd_debug (3, "JBD: commit phase 2\n");
 
+       if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
+               write_op = WRITE_SYNC;
+
        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        blk_start_plug(&plug);
        err = journal_submit_data_buffers(journal, commit_transaction,
-                                         WRITE_SYNC);
+                                         write_op);
        blk_finish_plug(&plug);
 
        /*
@@ -478,7 +482,7 @@ void journal_commit_transaction(journal_t *journal)
 
        blk_start_plug(&plug);
 
-       journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
+       journal_write_revoke_records(journal, commit_transaction, write_op);
 
        /*
         * If we found any dirty or locked buffers, then we should have
@@ -649,7 +653,7 @@ start_journal_io:
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
-                               submit_bh(WRITE_SYNC, bh);
+                               submit_bh(write_op, bh);
                        }
                        cond_resched();
 
index 0971e9217808829a67f5fd340a1972a63a982930..2047fd77bf38213c5055e3342112c6c94c3917e1 100644 (file)
@@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
        spin_unlock(&journal->j_state_lock);
 #endif
        spin_lock(&journal->j_state_lock);
+       if (!tid_geq(journal->j_commit_waited, tid))
+               journal->j_commit_waited = tid;
        while (tid_gt(tid, journal->j_commit_sequence)) {
                jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
                                  tid, journal->j_commit_sequence);
index b2a7e5244e394d4bf88bcd4d8156d108d5d772e8..febc10db5cedb8d26e40d2a894105f0af3547735 100644 (file)
@@ -1433,8 +1433,6 @@ int journal_stop(handle_t *handle)
                }
        }
 
-       if (handle->h_sync)
-               transaction->t_synchronous_commit = 1;
        current->journal_info = NULL;
        spin_lock(&journal->j_state_lock);
        spin_lock(&transaction->t_handle_lock);
index d211732b9e99c8a6d95bff50b56505c651285a70..f265682ae1341fd74656042662b371d2c04a4032 100644 (file)
@@ -479,12 +479,6 @@ struct transaction_s
         * How many handles used this transaction? [t_handle_lock]
         */
        int t_handle_count;
-
-       /*
-        * This transaction is being forced and some process is
-        * waiting for it to finish.
-        */
-       unsigned int t_synchronous_commit:1;
 };
 
 /**
@@ -531,6 +525,8 @@ struct transaction_s
  *  transaction
  * @j_commit_request: Sequence number of the most recent transaction wanting
  *     commit
+ * @j_commit_waited: Sequence number of the most recent transaction someone
+ *     is waiting for to commit.
  * @j_uuid: Uuid of client object.
  * @j_task: Pointer to the current commit thread for this journal
  * @j_max_transaction_buffers:  Maximum number of metadata buffers to allow in a
@@ -695,6 +691,13 @@ struct journal_s
         */
        tid_t                   j_commit_request;
 
+       /*
+        * Sequence number of the most recent transaction someone is waiting
+        * for to commit.
+        * [j_state_lock]
+        */
+       tid_t                   j_commit_waited;
+
        /*
         * Journal uuid: identifies the object (filesystem, LVM volume etc)
         * backed by this journal.  This will eventually be replaced by an array
index aff64d82d713b44785240f8bc47697a336132bed..9305e1b5edc32c6ebe1b2e017788d5295f130d73 100644 (file)
@@ -36,19 +36,17 @@ DECLARE_EVENT_CLASS(jbd_commit,
 
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
-               __field(        char,   sync_commit             )
                __field(        int,    transaction             )
        ),
 
        TP_fast_assign(
                __entry->dev            = journal->j_fs_dev->bd_dev;
-               __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction    = commit_transaction->t_tid;
        ),
 
-       TP_printk("dev %d,%d transaction %d sync %d",
+       TP_printk("dev %d,%d transaction %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->transaction, __entry->sync_commit)
+                 __entry->transaction)
 );
 
 DEFINE_EVENT(jbd_commit, jbd_start_commit,
@@ -87,19 +85,17 @@ TRACE_EVENT(jbd_drop_transaction,
 
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
-               __field(        char,   sync_commit             )
                __field(        int,    transaction             )
        ),
 
        TP_fast_assign(
                __entry->dev            = journal->j_fs_dev->bd_dev;
-               __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction    = commit_transaction->t_tid;
        ),
 
-       TP_printk("dev %d,%d transaction %d sync %d",
+       TP_printk("dev %d,%d transaction %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->transaction, __entry->sync_commit)
+                 __entry->transaction)
 );
 
 TRACE_EVENT(jbd_end_commit,
@@ -109,21 +105,19 @@ TRACE_EVENT(jbd_end_commit,
 
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
-               __field(        char,   sync_commit             )
                __field(        int,    transaction             )
                __field(        int,    head                    )
        ),
 
        TP_fast_assign(
                __entry->dev            = journal->j_fs_dev->bd_dev;
-               __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction    = commit_transaction->t_tid;
                __entry->head           = journal->j_tail_sequence;
        ),
 
-       TP_printk("dev %d,%d transaction %d sync %d head %d",
+       TP_printk("dev %d,%d transaction %d head %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->transaction, __entry->sync_commit, __entry->head)
+                 __entry->transaction, __entry->head)
 );
 
 TRACE_EVENT(jbd_do_submit_data,
@@ -133,19 +127,17 @@ TRACE_EVENT(jbd_do_submit_data,
 
        TP_STRUCT__entry(
                __field(        dev_t,  dev                     )
-               __field(        char,   sync_commit             )
                __field(        int,    transaction             )
        ),
 
        TP_fast_assign(
                __entry->dev            = journal->j_fs_dev->bd_dev;
-               __entry->sync_commit = commit_transaction->t_synchronous_commit;
                __entry->transaction    = commit_transaction->t_tid;
        ),
 
-       TP_printk("dev %d,%d transaction %d sync %d",
+       TP_printk("dev %d,%d transaction %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                  __entry->transaction, __entry->sync_commit)
+                  __entry->transaction)
 );
 
 TRACE_EVENT(jbd_cleanup_journal_tail,