Btrfs: add a priority queue to the async thread helpers
authorChris Mason <chris.mason@oracle.com>
Mon, 20 Apr 2009 19:50:09 +0000 (15:50 -0400)
committerChris Mason <chris.mason@oracle.com>
Mon, 20 Apr 2009 19:53:08 +0000 (15:53 -0400)
Btrfs is using WRITE_SYNC_PLUG to send down synchronous IOs with a
higher priority.  But, the checksumming helper threads prevent it
from being fully effective.

There are two problems.  First, a big queue of pending checksumming
will delay the synchronous IO behind other lower priority writes.  Second,
the checksumming uses an ordered async work queue.  The ordering makes sure
that IOs are sent to the block layer in the same order they are sent
to the checksumming threads.  Usually this gives us less seeky IO.

But, when we start mixing IO priorities, the lower priority IO can delay
the higher priority IO.

This patch solves both problems by adding a high priority list to the async
helper threads, and a new btrfs_set_work_high_prio(), which is used
to make put a new async work item onto the higher priority list.

The ordering is still done on high priority IO, but all of the high
priority bios are ordered separately from the low priority bios.  This
ordering is purely an IO optimization, it is not involved in data
or metadata integrity.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/disk-io.c
fs/btrfs/extent_io.c
fs/btrfs/file.c

index 51bfdfc8fcdac00b5f13599bed87922563a38f07..502c3d61de62509612312c6e29de442c605a33cb 100644 (file)
@@ -25,6 +25,7 @@
 #define WORK_QUEUED_BIT 0
 #define WORK_DONE_BIT 1
 #define WORK_ORDER_DONE_BIT 2
+#define WORK_HIGH_PRIO_BIT 3
 
 /*
  * container for the kthread task pointer and the list of pending work
@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
 
        /* list of struct btrfs_work that are waiting for service */
        struct list_head pending;
+       struct list_head prio_pending;
 
        /* list of worker threads from struct btrfs_workers */
        struct list_head worker_list;
@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
 
        spin_lock_irqsave(&workers->lock, flags);
 
-       while (!list_empty(&workers->order_list)) {
-               work = list_entry(workers->order_list.next,
-                                 struct btrfs_work, order_list);
-
+       while (1) {
+               if (!list_empty(&workers->prio_order_list)) {
+                       work = list_entry(workers->prio_order_list.next,
+                                         struct btrfs_work, order_list);
+               } else if (!list_empty(&workers->order_list)) {
+                       work = list_entry(workers->order_list.next,
+                                         struct btrfs_work, order_list);
+               } else {
+                       break;
+               }
                if (!test_bit(WORK_DONE_BIT, &work->flags))
                        break;
 
@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
        do {
                spin_lock_irq(&worker->lock);
 again_locked:
-               while (!list_empty(&worker->pending)) {
-                       cur = worker->pending.next;
+               while (1) {
+                       if (!list_empty(&worker->prio_pending))
+                               cur = worker->prio_pending.next;
+                       else if (!list_empty(&worker->pending))
+                               cur = worker->pending.next;
+                       else
+                               break;
+
                        work = list_entry(cur, struct btrfs_work, list);
                        list_del(&work->list);
                        clear_bit(WORK_QUEUED_BIT, &work->flags);
@@ -163,7 +177,6 @@ again_locked:
 
                        spin_lock_irq(&worker->lock);
                        check_idle_worker(worker);
-
                }
                if (freezing(current)) {
                        worker->working = 0;
@@ -178,7 +191,8 @@ again_locked:
                                 * jump_in?
                                 */
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                /*
@@ -191,7 +205,8 @@ again_locked:
                                 */
                                schedule_timeout(1);
                                smp_mb();
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        continue;
 
                                if (kthread_should_stop())
@@ -200,7 +215,8 @@ again_locked:
                                /* still no more work?, sleep for real */
                                spin_lock_irq(&worker->lock);
                                set_current_state(TASK_INTERRUPTIBLE);
-                               if (!list_empty(&worker->pending))
+                               if (!list_empty(&worker->pending) ||
+                                   !list_empty(&worker->prio_pending))
                                        goto again_locked;
 
                                /*
@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
+       INIT_LIST_HEAD(&workers->prio_order_list);
        spin_lock_init(&workers->lock);
        workers->max_workers = max;
        workers->idle_thresh = 32;
@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                }
 
                INIT_LIST_HEAD(&worker->pending);
+               INIT_LIST_HEAD(&worker->prio_pending);
                INIT_LIST_HEAD(&worker->worker_list);
                spin_lock_init(&worker->lock);
                atomic_set(&worker->num_pending, 0);
@@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work)
                goto out;
 
        spin_lock_irqsave(&worker->lock, flags);
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
 
        /* by definition we're busy, take ourselves off the idle
@@ -422,6 +443,11 @@ out:
        return 0;
 }
 
+void btrfs_set_work_high_prio(struct btrfs_work *work)
+{
+       set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+}
+
 /*
  * places a struct btrfs_work into the pending queue of one of the kthreads
  */
@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
        worker = find_worker(workers);
        if (workers->ordered) {
                spin_lock_irqsave(&workers->lock, flags);
-               list_add_tail(&work->order_list, &workers->order_list);
+               if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
+                       list_add_tail(&work->order_list,
+                                     &workers->prio_order_list);
+               } else {
+                       list_add_tail(&work->order_list, &workers->order_list);
+               }
                spin_unlock_irqrestore(&workers->lock, flags);
        } else {
                INIT_LIST_HEAD(&work->order_list);
@@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 
        spin_lock_irqsave(&worker->lock, flags);
 
-       list_add_tail(&work->list, &worker->pending);
+       if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
+               list_add_tail(&work->list, &worker->prio_pending);
+       else
+               list_add_tail(&work->list, &worker->pending);
        atomic_inc(&worker->num_pending);
        check_busy_worker(worker);
 
index 31be4ed8b63ea0aec80b2c31781fad2ef57786f6..1b511c109db658ef1d772bb90d77b795f780e63d 100644 (file)
@@ -85,6 +85,7 @@ struct btrfs_workers {
         * of work items waiting for completion
         */
        struct list_head order_list;
+       struct list_head prio_order_list;
 
        /* lock for finding the next worker thread to queue on */
        spinlock_t lock;
@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
 int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
index fec18b43c2c37eff80ae7d0c932ce7ad2f280251..a6b83744b05da6184caff93080cff1eeab1159b9 100644 (file)
@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->bio_flags = bio_flags;
 
        atomic_inc(&fs_info->nr_async_submits);
+
+       if (rw & (1 << BIO_RW_SYNCIO))
+               btrfs_set_work_high_prio(&async->work);
+
        btrfs_queue_worker(&fs_info->workers, &async->work);
 #if 0
        int limit = btrfs_async_submit_limit(fs_info);
@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
                                     mirror_num, 0);
        }
+
        /*
         * kthread helpers are used to submit writes so that checksumming
         * can happen in parallel across all CPUs
index 483b6727aaafdcc6c98f1567a0fae4379ac48750..5d66cb27e422d91dd74818a81abd00ca96667674 100644 (file)
@@ -2501,7 +2501,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
        };
        struct writeback_control wbc_writepages = {
                .bdi            = wbc->bdi,
-               .sync_mode      = WB_SYNC_NONE,
+               .sync_mode      = wbc->sync_mode,
                .older_than_this = NULL,
                .nr_to_write    = 64,
                .range_start    = page_offset(page) + PAGE_CACHE_SIZE,
index 9c9fb46ccd08f9fe7eaf9d22991738f9231810c2..e21c0060ee7304c00f7e111dd96573e9c56b6cbd 100644 (file)
@@ -1131,7 +1131,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                if (will_write) {
                        btrfs_fdatawrite_range(inode->i_mapping, pos,
                                               pos + write_bytes - 1,
-                                              WB_SYNC_NONE);
+                                              WB_SYNC_ALL);
                } else {
                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
                                                           num_pages);