/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
+/* I: attributes used when instantiating ordered pools on demand */
+static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
*/
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+ /*
+ * The following mb guarantees that previous clear of a PENDING bit
+ * will not be reordered with any speculative LOADS or STORES from
+ * work->current_func, which is executed afterwards. This possible
+ * reordering can lead to a missed execution on attempt to qeueue
+ * the same @work. E.g. consider this case:
+ *
+ * CPU#0 CPU#1
+ * ---------------------------- --------------------------------
+ *
+ * 1 STORE event_indicated
+ * 2 queue_work_on() {
+ * 3 test_and_set_bit(PENDING)
+ * 4 } set_..._and_clear_pending() {
+ * 5 set_work_data() # clear bit
+ * 6 smp_mb()
+ * 7 work->current_func() {
+ * 8 LOAD event_indicated
+ * }
+ *
+ * Without an explicit full barrier speculative LOAD on line 8 can
+ * be executed before CPU#0 does STORE on line 1. If that happens,
+ * CPU#0 observes the PENDING bit is still set and new execution of
+ * a @work is not queued in a hope, that CPU#1 will eventually
+ * finish the queued @work. Meanwhile CPU#1 does not see
+ * event_indicated is set, because speculative LOAD was executed
+ * before actual STORE.
+ */
+ smp_mb();
}
static void clear_work_data(struct work_struct *work)
if (worker->flags & WORKER_IDLE)
pool->nr_idle--;
+ /*
+ * Once WORKER_DIE is set, the kworker may destroy itself at any
+ * point. Pin to ensure the task stays until we're done with it.
+ */
+ get_task_struct(worker->task);
+
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
spin_unlock_irq(&pool->lock);
kthread_stop(worker->task);
+ put_task_struct(worker->task);
kfree(worker);
spin_lock_irq(&pool->lock);
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
- *
- * RETURNS:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
*/
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
if (!need_to_create_worker(pool))
- return false;
+ return;
restart:
spin_unlock_irq(&pool->lock);
start_worker(worker);
if (WARN_ON_ONCE(need_to_create_worker(pool)))
goto restart;
- return true;
+ return;
}
if (!need_to_create_worker(pool))
spin_lock_irq(&pool->lock);
if (need_to_create_worker(pool))
goto restart;
- return true;
+ return;
}
/**
* LOCKING:
* spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Called only from manager.
- *
- * RETURNS:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
*/
-static bool maybe_destroy_workers(struct worker_pool *pool)
+static void maybe_destroy_workers(struct worker_pool *pool)
{
- bool ret = false;
-
while (too_many_workers(pool)) {
struct worker *worker;
unsigned long expires;
}
destroy_worker(worker);
- ret = true;
}
-
- return ret;
}
/**
* multiple times. Does GFP_KERNEL allocations.
*
* RETURNS:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times. Does GFP_KERNEL allocations.
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management function was performed and
+ * the conditions that the caller verified before calling the function may
+ * no longer be true.
*/
static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- bool ret = false;
/*
* Managership is governed by two mutexes - manager_arb and
* manager_mutex.
*/
if (!mutex_trylock(&pool->manager_arb))
- return ret;
+ return false;
/*
* With manager arbitration won, manager_mutex would be free in
spin_unlock_irq(&pool->lock);
mutex_lock(&pool->manager_mutex);
spin_lock_irq(&pool->lock);
- ret = true;
}
pool->flags &= ~POOL_MANAGE_WORKERS;
* Destroy and then create so that may_start_working() is true
* on return.
*/
- ret |= maybe_destroy_workers(pool);
- ret |= maybe_create_worker(pool);
+ maybe_destroy_workers(pool);
+ maybe_create_worker(pool);
mutex_unlock(&pool->manager_mutex);
mutex_unlock(&pool->manager_arb);
- return ret;
+ return true;
}
/**
dump_stack();
}
+ /*
+ * The following prevents a kworker from hogging CPU on !PREEMPT
+ * kernels, where a requeueing work item waiting for something to
+ * happen could deadlock with stop_machine as such work item could
+ * indefinitely requeue itself while all other CPUs are trapped in
+ * stop_machine.
+ */
+ cond_resched();
+
spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
+ bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
- return 0;
- }
+ /*
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
+ */
+ should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
process_scheduled_works(rescuer);
+ /*
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're holding its lock.
+ */
+ put_pwq(pwq);
+
/*
* Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
spin_unlock_irq(&wq_mayday_lock);
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
}
EXPORT_SYMBOL_GPL(flush_work);
+struct cwt_wait {
+ wait_queue_t wait;
+ struct work_struct *work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+ if (cwait->work != key)
+ return 0;
+ return autoremove_wake_function(wait, mode, sync, key);
+}
+
static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
+ static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
unsigned long flags;
int ret;
do {
ret = try_to_grab_pending(work, is_dwork, &flags);
/*
- * If someone else is canceling, wait for the same event it
- * would be waiting for before retrying.
+ * If someone else is already canceling, wait for it to
+ * finish. flush_work() doesn't work for PREEMPT_NONE
+ * because we may get scheduled between @work's completion
+ * and the other canceling task resuming and clearing
+ * CANCELING - flush_work() will return false immediately
+ * as @work is no longer busy, try_to_grab_pending() will
+ * return -ENOENT as @work is still being canceled and the
+ * other canceling task won't be able to clear CANCELING as
+ * we're hogging the CPU.
+ *
+ * Let's wait for completion using a waitqueue. As this
+ * may lead to the thundering herd problem, use a custom
+ * wake function which matches @work along with exclusive
+ * wait and wakeup.
*/
- if (unlikely(ret == -ENOENT))
- flush_work(work);
+ if (unlikely(ret == -ENOENT)) {
+ struct cwt_wait cwait;
+
+ init_wait(&cwait.wait);
+ cwait.wait.func = cwt_wakefn;
+ cwait.work = work;
+
+ prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+ TASK_UNINTERRUPTIBLE);
+ if (work_is_canceling(work))
+ schedule();
+ finish_wait(&cancel_waitq, &cwait.wait);
+ }
} while (unlikely(ret < 0));
/* tell other tasks trying to grab @work to back off */
flush_work(work);
clear_work_data(work);
+
+ /*
+ * Paired with prepare_to_wait() above so that either
+ * waitqueue_active() is visible here or !work_is_canceling() is
+ * visible there.
+ */
+ smp_mb();
+ if (waitqueue_active(&cancel_waitq))
+ __wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
return ret;
}
}
}
+ dev_set_uevent_suppress(&wq_dev->dev, false);
kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
return 0;
}
{
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
+ /*
+ * Unlike hash and equality test, this function doesn't ignore
+ * ->no_numa as it is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears ->no_numa after copying.
+ */
+ to->no_numa = from->no_numa;
}
/* hash value of the content of @attr */
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
+ /*
+ * no_numa isn't a worker_pool attribute, always clear it. See
+ * 'struct workqueue_attrs' comments for detail.
+ */
+ pool->attrs->no_numa = false;
+
/* if cpumask is contained inside a NUMA node, we belong to that node */
if (wq_numa_enabled) {
for_each_node(node) {
if (!pwq) {
pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
- goto out_unlock;
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
}
/*
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
bool highpri = wq->flags & WQ_HIGHPRI;
- int cpu;
+ int cpu, ret;
if (!(wq->flags & WQ_UNBOUND)) {
wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
mutex_unlock(&wq->mutex);
}
return 0;
+ } else if (wq->flags & __WQ_ORDERED) {
+ ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ /* there should only be single pwq for ordering guarantee */
+ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ "ordering guarantee broken for workqueue %s\n", wq->name);
+ return ret;
} else {
return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
BUG_ON(!tbl);
for_each_node(node)
- BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+ BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
node_online(node) ? node : NUMA_NO_NODE));
for_each_possible_cpu(cpu) {
}
}
- /* create default unbound wq attrs */
+ /* create default unbound and ordered wq attrs */
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
+
+ /*
+ * An ordered wq should have only one pwq as ordering is
+ * guaranteed by max_active which is enforced by pwqs.
+ * Turn off NUMA so that dfl_pwq is used for all nodes.
+ */
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ attrs->no_numa = true;
+ ordered_wq_attrs[i] = attrs;
}
system_wq = alloc_workqueue("events", 0, 0);