Btrfs: fix deadlock on async thread startup
authorChris Mason <chris.mason@oracle.com>
Fri, 2 Oct 2009 23:11:56 +0000 (19:11 -0400)
committerChris Mason <chris.mason@oracle.com>
Mon, 5 Oct 2009 13:44:45 +0000 (09:44 -0400)
The btrfs async worker threads are used for a wide variety of things,
including processing bio end_io functions.  This means that when
the endio threads aren't running, the rest of the FS isn't
able to do the final processing required to clear PageWriteback.

The endio threads also try to exit as they become idle and
start more as the work piles up.  The problem is that starting more
threads means kthreadd may need to allocate ram, and that allocation
may wait until the global number of writeback pages on the system is
below a certain limit.

The result of that throttling is that end IO threads wait on
kthreadd, who is waiting on IO to end, which will never happen.

This commit fixes the deadlock by handing off thread startup to a
dedicated thread.  It also fixes a bug where the on-demand thread
creation was creating far too many threads because it didn't take into
account threads being started by other procs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h
fs/btrfs/ctree.h
fs/btrfs/disk-io.c
fs/btrfs/relocation.c

index 282ca085c2fbff854bcf4a396d0773db129c5085..c0861e781cdbdfde4b7c42ec1b90032f2059e69a 100644 (file)
@@ -63,6 +63,51 @@ struct btrfs_worker_thread {
        int idle;
 };
 
+/*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time.  It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations.  This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+       struct btrfs_work work;
+       struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+       struct worker_start *start;
+       start = container_of(work, struct worker_start, work);
+       btrfs_start_workers(start->queue, 1);
+       kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+       struct worker_start *start;
+       int ret;
+
+       start = kzalloc(sizeof(*start), GFP_NOFS);
+       if (!start)
+               return -ENOMEM;
+
+       start->work.func = start_new_worker_func;
+       start->queue = queue;
+       ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+       if (ret)
+               kfree(start);
+       return ret;
+}
+
 /*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
                goto out;
 
        workers->atomic_start_pending = 0;
-       if (workers->num_workers >= workers->max_workers)
+       if (workers->num_workers + workers->num_workers_starting >=
+           workers->max_workers)
                goto out;
 
+       workers->num_workers_starting += 1;
        spin_unlock_irqrestore(&workers->lock, flags);
-       btrfs_start_workers(workers, 1);
+       start_new_worker(workers);
        return;
 
 out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 /*
  * simple init on struct btrfs_workers
  */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+                       struct btrfs_workers *async_helper)
 {
        workers->num_workers = 0;
+       workers->num_workers_starting = 0;
        INIT_LIST_HEAD(&workers->worker_list);
        INIT_LIST_HEAD(&workers->idle_list);
        INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
        workers->name = name;
        workers->ordered = 0;
        workers->atomic_start_pending = 0;
-       workers->atomic_worker_start = 0;
+       workers->atomic_worker_start = async_helper;
 }
 
 /*
  * starts new worker threads.  This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+                                int num_workers)
 {
        struct btrfs_worker_thread *worker;
        int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
                list_add_tail(&worker->worker_list, &workers->idle_list);
                worker->idle = 1;
                workers->num_workers++;
+               workers->num_workers_starting--;
+               WARN_ON(workers->num_workers_starting < 0);
                spin_unlock_irq(&workers->lock);
        }
        return 0;
@@ -452,6 +504,14 @@ fail:
        return ret;
 }
 
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+       spin_lock_irq(&workers->lock);
+       workers->num_workers_starting += num_workers;
+       spin_unlock_irq(&workers->lock);
+       return __btrfs_start_workers(workers, num_workers);
+}
+
 /*
  * run through the list and find a worker thread that doesn't have a lot
  * to do right now.  This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
        struct btrfs_worker_thread *worker;
        struct list_head *next;
-       int enforce_min = workers->num_workers < workers->max_workers;
+       int enforce_min;
+
+       enforce_min = (workers->num_workers + workers->num_workers_starting) <
+               workers->max_workers;
 
        /*
         * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
        worker = next_worker(workers);
 
        if (!worker) {
-               if (workers->num_workers >= workers->max_workers) {
+               if (workers->num_workers + workers->num_workers_starting >=
+                   workers->max_workers) {
                        goto fallback;
                } else if (workers->atomic_worker_start) {
                        workers->atomic_start_pending = 1;
                        goto fallback;
                } else {
+                       workers->num_workers_starting++;
                        spin_unlock_irqrestore(&workers->lock, flags);
                        /* we're below the limit, start another worker */
-                       btrfs_start_workers(workers, 1);
+                       __btrfs_start_workers(workers, 1);
                        goto again;
                }
        }
index fc089b95ec14f24c9a971b44f3718c2029f241ef..5077746cf85e049e87bcd8ded49b592ecc271605 100644 (file)
@@ -64,6 +64,8 @@ struct btrfs_workers {
        /* current number of running workers */
        int num_workers;
 
+       int num_workers_starting;
+
        /* max number of workers allowed.  changed by btrfs_start_workers */
        int max_workers;
 
@@ -78,9 +80,10 @@ struct btrfs_workers {
 
        /*
         * are we allowed to sleep while starting workers or are we required
-        * to start them at a later time?
+        * to start them at a later time?  If we can't sleep, this indicates
+        * which queue we need to use to schedule thread creation.
         */
-       int atomic_worker_start;
+       struct btrfs_workers *atomic_worker_start;
 
        /* list with all the work threads.  The workers on the idle thread
         * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+                       struct btrfs_workers *async_starter);
 int btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
index 8184f2feb2f3f5ae8815b070e824c0ebc9330eaa..1b920ffc6a598047df4f9fffa7b59aad235cd836 100644 (file)
@@ -907,6 +907,7 @@ struct btrfs_fs_info {
         * A third pool does submit_bio to avoid deadlocking with the other
         * two
         */
+       struct btrfs_workers generic_worker;
        struct btrfs_workers workers;
        struct btrfs_workers delalloc_workers;
        struct btrfs_workers endio_workers;
index 69dce50aabd210ed669fc22246d26cfdc5856eb0..9903f042765d9fb158ecf3ddf02c5a0033d6606b 100644 (file)
@@ -1746,21 +1746,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                err = -EINVAL;
                goto fail_iput;
        }
-printk("thread pool is %d\n", fs_info->thread_pool_size);
-       /*
-        * we need to start all the end_io workers up front because the
-        * queue work function gets called at interrupt time, and so it
-        * cannot dynamically grow.
-        */
+
+       btrfs_init_workers(&fs_info->generic_worker,
+                          "genwork", 1, NULL);
+
        btrfs_init_workers(&fs_info->workers, "worker",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
 
        btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
 
        btrfs_init_workers(&fs_info->submit_workers, "submit",
                           min_t(u64, fs_devices->num_devices,
-                          fs_info->thread_pool_size));
+                          fs_info->thread_pool_size),
+                          &fs_info->generic_worker);
 
        /* a higher idle thresh on the submit workers makes it much more
         * likely that bios will be send down in a sane order to the
@@ -1774,15 +1775,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
        fs_info->delalloc_workers.idle_thresh = 2;
        fs_info->delalloc_workers.ordered = 1;
 
-       btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+       btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_workers, "endio",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_meta_write_workers,
-                          "endio-meta-write", fs_info->thread_pool_size);
+                          "endio-meta-write", fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
        btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size,
+                          &fs_info->generic_worker);
 
        /*
         * endios are largely parallel and should have a very
@@ -1794,12 +1800,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
        fs_info->endio_write_workers.idle_thresh = 2;
        fs_info->endio_meta_write_workers.idle_thresh = 2;
 
-       fs_info->endio_workers.atomic_worker_start = 1;
-       fs_info->endio_meta_workers.atomic_worker_start = 1;
-       fs_info->endio_write_workers.atomic_worker_start = 1;
-       fs_info->endio_meta_write_workers.atomic_worker_start = 1;
-
        btrfs_start_workers(&fs_info->workers, 1);
+       btrfs_start_workers(&fs_info->generic_worker, 1);
        btrfs_start_workers(&fs_info->submit_workers, 1);
        btrfs_start_workers(&fs_info->delalloc_workers, 1);
        btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -2012,6 +2014,7 @@ fail_chunk_root:
        free_extent_buffer(chunk_root->node);
        free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
+       btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->delalloc_workers);
        btrfs_stop_workers(&fs_info->workers);
@@ -2437,6 +2440,7 @@ int close_ctree(struct btrfs_root *root)
 
        iput(fs_info->btree_inode);
 
+       btrfs_stop_workers(&fs_info->generic_worker);
        btrfs_stop_workers(&fs_info->fixup_workers);
        btrfs_stop_workers(&fs_info->delalloc_workers);
        btrfs_stop_workers(&fs_info->workers);
index 361ad323faaceb4ed3313bd1e2904ea29ea7d2f3..cfcc93c93a7b4db99b87ef3b91b30cc8d2f7e36b 100644 (file)
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        BUG_ON(!rc->block_group);
 
        btrfs_init_workers(&rc->workers, "relocate",
-                          fs_info->thread_pool_size);
+                          fs_info->thread_pool_size, NULL);
 
        rc->extent_root = extent_root;
        btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        mapping_tree_init(&rc->reloc_root_tree);
        INIT_LIST_HEAD(&rc->reloc_roots);
        btrfs_init_workers(&rc->workers, "relocate",
-                          root->fs_info->thread_pool_size);
+                          root->fs_info->thread_pool_size, NULL);
        rc->extent_root = root->fs_info->extent_root;
 
        set_reloc_control(rc);