bdi: add a user-tunable cpu_list for the bdi flusher threads
author Jeff Moyer <jmoyer@redhat.com>
Wed, 5 Dec 2012 19:17:21 +0000 (20:17 +0100)
committer Jens Axboe <axboe@kernel.dk>
Wed, 5 Dec 2012 19:17:21 +0000 (20:17 +0100)
In realtime environments, it may be desirable to keep the per-bdi
flusher threads from running on certain CPUs.  This patch adds a
cpu_list file to /sys/class/bdi/* to enable this.  The default is to
tie the flusher threads to the same NUMA node as the backing device
(though I could be convinced to make it a mask of all CPUs to avoid a
change in behaviour).
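
For example, to restrict a device's flusher thread to CPUs 2-3 (the
bdi name 8:0 below is hypothetical, and the default mask shown assumes
the device sits on a NUMA node containing CPUs 0-3):

	# cat /sys/class/bdi/8:0/cpu_list
	0-3
	# echo 2-3 > /sys/class/bdi/8:0/cpu_list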

Thanks to Jeremy Eder for the original idea.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
include/linux/backing-dev.h
mm/backing-dev.c

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 2a9a9abc91260c09a7940136a965a08209c5828b..238521a19849b2865d26aaffd9cfe1e7739ad70a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/atomic.h>
 #include <linux/sysctl.h>
+#include <linux/mutex.h>
 
 struct page;
 struct device;
@@ -105,6 +106,9 @@ struct backing_dev_info {
 
        struct timer_list laptop_mode_wb_timer;
 
+       cpumask_t *flusher_cpumask; /* used for writeback thread scheduling */
+       struct mutex flusher_cpumask_lock;
+
 #ifdef CONFIG_DEBUG_FS
        struct dentry *debug_dir;
        struct dentry *debug_stats;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d3ca2b3ee17657f8045c4dca291721ac6c38c9be..bd6a6cabef7148490eb56ea426c3c4e6a7745013 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/writeback.h>
 #include <linux/device.h>
+#include <linux/slab.h>
 #include <trace/events/writeback.h>
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
@@ -221,12 +222,62 @@ static ssize_t max_ratio_store(struct device *dev,
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
 
+static ssize_t cpu_list_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{
+       struct backing_dev_info *bdi = dev_get_drvdata(dev);
+       struct bdi_writeback *wb = &bdi->wb;
+       cpumask_var_t newmask;
+       ssize_t ret;
+       struct task_struct *task;
+
+       if (!alloc_cpumask_var(&newmask, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = cpulist_parse(buf, newmask);
+       if (!ret) {
+               spin_lock_bh(&bdi->wb_lock);
+               task = wb->task;
+               if (task)
+                       get_task_struct(task);
+               spin_unlock_bh(&bdi->wb_lock);
+
+               mutex_lock(&bdi->flusher_cpumask_lock);
+               if (task) {
+                       ret = set_cpus_allowed_ptr(task, newmask);
+                       put_task_struct(task);
+               }
+               if (ret == 0) {
+                       cpumask_copy(bdi->flusher_cpumask, newmask);
+                       ret = count;
+               }
+               mutex_unlock(&bdi->flusher_cpumask_lock);
+
+       }
+       free_cpumask_var(newmask);
+
+       return ret;
+}
+
+static ssize_t cpu_list_show(struct device *dev,
+               struct device_attribute *attr, char *page)
+{
+       struct backing_dev_info *bdi = dev_get_drvdata(dev);
+       ssize_t ret;
+
+       mutex_lock(&bdi->flusher_cpumask_lock);
+       ret = cpulist_scnprintf(page, PAGE_SIZE-1, bdi->flusher_cpumask);
+       mutex_unlock(&bdi->flusher_cpumask_lock);
+
+       return ret;
+}
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
        __ATTR_RW(read_ahead_kb),
        __ATTR_RW(min_ratio),
        __ATTR_RW(max_ratio),
+       __ATTR_RW(cpu_list),
        __ATTR_NULL,
 };
 
@@ -428,6 +480,7 @@ static int bdi_forker_thread(void *ptr)
                                writeback_inodes_wb(&bdi->wb, 1024,
                                                    WB_REASON_FORKER_THREAD);
                        } else {
+                               int ret;
                                /*
                                 * The spinlock makes sure we do not lose
                                 * wake-ups when racing with 'bdi_queue_work()'.
@@ -437,6 +490,14 @@ static int bdi_forker_thread(void *ptr)
                                spin_lock_bh(&bdi->wb_lock);
                                bdi->wb.task = task;
                                spin_unlock_bh(&bdi->wb_lock);
+                               mutex_lock(&bdi->flusher_cpumask_lock);
+                               ret = set_cpus_allowed_ptr(task,
+                                                       bdi->flusher_cpumask);
+                               mutex_unlock(&bdi->flusher_cpumask_lock);
+                               if (ret)
+                                       printk_once(KERN_ERR "%s: failed to bind"
+                                                   " flusher thread %s, error %d\n",
+                                                   __func__, task->comm, ret);
                                wake_up_process(task);
                        }
                        bdi_clear_pending(bdi);
@@ -509,6 +570,17 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                                                dev_name(dev));
                if (IS_ERR(wb->task))
                        return PTR_ERR(wb->task);
+       } else {
+               int node;
+               /*
+                * Set up a default cpumask for the flusher threads that
+                * includes all cpus on the same numa node as the device.
+                * The mask may be overridden via sysfs.
+                */
+               node = dev_to_node(bdi->dev);
+               if (node != NUMA_NO_NODE)
+                       cpumask_copy(bdi->flusher_cpumask,
+                                    cpumask_of_node(node));
        }
 
        bdi_debug_register(bdi, dev_name(dev));
@@ -634,6 +706,15 @@ int bdi_init(struct backing_dev_info *bdi)
 
        bdi_wb_init(&bdi->wb, bdi);
 
+       if (!bdi_cap_flush_forker(bdi)) {
+               bdi->flusher_cpumask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+               if (!bdi->flusher_cpumask)
+                       return -ENOMEM;
+               cpumask_setall(bdi->flusher_cpumask);
+               mutex_init(&bdi->flusher_cpumask_lock);
+       } else
+               bdi->flusher_cpumask = NULL;
+
        for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
                err = percpu_counter_init(&bdi->bdi_stat[i], 0);
                if (err)
@@ -656,6 +737,7 @@ int bdi_init(struct backing_dev_info *bdi)
 err:
                while (i--)
                        percpu_counter_destroy(&bdi->bdi_stat[i]);
+               kfree(bdi->flusher_cpumask);
        }
 
        return err;
@@ -683,6 +765,8 @@ void bdi_destroy(struct backing_dev_info *bdi)
 
        bdi_unregister(bdi);
 
+       kfree(bdi->flusher_cpumask);
+
        /*
         * If bdi_unregister() had already been called earlier, the
         * wakeup_timer could still be armed because bdi_prune_sb()