IB/ehca: Fix race condition/locking issues in scaling code
authorHoang-Nam Nguyen <hnguyen@linux.vnet.ibm.com>
Thu, 15 Feb 2007 16:07:30 +0000 (17:07 +0100)
committerRoland Dreier <rolandd@cisco.com>
Fri, 16 Feb 2007 21:57:34 +0000 (13:57 -0800)
Fix a race condition in find_next_cpu_online() and some other locking
issues in ehca scaling code.

Signed-off-by: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
drivers/infiniband/hw/ehca/ehca_irq.c

index b923b5d5de68d263321b91254c2a9d80697ab79f..9679b072ad013bffbb265375535802f44d39839a 100644 (file)
@@ -544,28 +544,30 @@ void ehca_tasklet_eq(unsigned long data)
 
 static inline int find_next_online_cpu(struct ehca_comp_pool* pool)
 {
-       unsigned long flags_last_cpu;
+       int cpu;
+       unsigned long flags;
 
+       WARN_ON_ONCE(!in_interrupt());
        if (ehca_debug_level)
                ehca_dmp(&cpu_online_map, sizeof(cpumask_t), "");
 
-       spin_lock_irqsave(&pool->last_cpu_lock, flags_last_cpu);
-       pool->last_cpu = next_cpu(pool->last_cpu, cpu_online_map);
-       if (pool->last_cpu == NR_CPUS)
-               pool->last_cpu = first_cpu(cpu_online_map);
-       spin_unlock_irqrestore(&pool->last_cpu_lock, flags_last_cpu);
+       spin_lock_irqsave(&pool->last_cpu_lock, flags);
+       cpu = next_cpu(pool->last_cpu, cpu_online_map);
+       if (cpu == NR_CPUS)
+               cpu = first_cpu(cpu_online_map);
+       pool->last_cpu = cpu;
+       spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
 
-       return pool->last_cpu;
+       return cpu;
 }
 
 static void __queue_comp_task(struct ehca_cq *__cq,
                              struct ehca_cpu_comp_task *cct)
 {
-       unsigned long flags_cct;
-       unsigned long flags_cq;
+       unsigned long flags;
 
-       spin_lock_irqsave(&cct->task_lock, flags_cct);
-       spin_lock_irqsave(&__cq->task_lock, flags_cq);
+       spin_lock_irqsave(&cct->task_lock, flags);
+       spin_lock(&__cq->task_lock);
 
        if (__cq->nr_callbacks == 0) {
                __cq->nr_callbacks++;
@@ -576,8 +578,8 @@ static void __queue_comp_task(struct ehca_cq *__cq,
        else
                __cq->nr_callbacks++;
 
-       spin_unlock_irqrestore(&__cq->task_lock, flags_cq);
-       spin_unlock_irqrestore(&cct->task_lock, flags_cct);
+       spin_unlock(&__cq->task_lock);
+       spin_unlock_irqrestore(&cct->task_lock, flags);
 }
 
 static void queue_comp_task(struct ehca_cq *__cq)
@@ -588,69 +590,69 @@ static void queue_comp_task(struct ehca_cq *__cq)
 
        cpu = get_cpu();
        cpu_id = find_next_online_cpu(pool);
-
        BUG_ON(!cpu_online(cpu_id));
 
        cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+       BUG_ON(!cct);
 
        if (cct->cq_jobs > 0) {
                cpu_id = find_next_online_cpu(pool);
                cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+               BUG_ON(!cct);
        }
 
        __queue_comp_task(__cq, cct);
-
-       put_cpu();
-
-       return;
 }
 
 static void run_comp_task(struct ehca_cpu_comp_task* cct)
 {
        struct ehca_cq *cq;
-       unsigned long flags_cct;
-       unsigned long flags_cq;
+       unsigned long flags;
 
-       spin_lock_irqsave(&cct->task_lock, flags_cct);
+       spin_lock_irqsave(&cct->task_lock, flags);
 
        while (!list_empty(&cct->cq_list)) {
                cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
-               spin_unlock_irqrestore(&cct->task_lock, flags_cct);
+               spin_unlock_irqrestore(&cct->task_lock, flags);
                comp_event_callback(cq);
-               spin_lock_irqsave(&cct->task_lock, flags_cct);
+               spin_lock_irqsave(&cct->task_lock, flags);
 
-               spin_lock_irqsave(&cq->task_lock, flags_cq);
+               spin_lock(&cq->task_lock);
                cq->nr_callbacks--;
                if (cq->nr_callbacks == 0) {
                        list_del_init(cct->cq_list.next);
                        cct->cq_jobs--;
                }
-               spin_unlock_irqrestore(&cq->task_lock, flags_cq);
-
+               spin_unlock(&cq->task_lock);
        }
 
-       spin_unlock_irqrestore(&cct->task_lock, flags_cct);
-
-       return;
+       spin_unlock_irqrestore(&cct->task_lock, flags);
 }
 
 static int comp_task(void *__cct)
 {
        struct ehca_cpu_comp_task* cct = __cct;
+       int cql_empty;
        DECLARE_WAITQUEUE(wait, current);
 
        set_current_state(TASK_INTERRUPTIBLE);
        while(!kthread_should_stop()) {
                add_wait_queue(&cct->wait_queue, &wait);
 
-               if (list_empty(&cct->cq_list))
+               spin_lock_irq(&cct->task_lock);
+               cql_empty = list_empty(&cct->cq_list);
+               spin_unlock_irq(&cct->task_lock);
+               if (cql_empty)
                        schedule();
                else
                        __set_current_state(TASK_RUNNING);
 
                remove_wait_queue(&cct->wait_queue, &wait);
 
-               if (!list_empty(&cct->cq_list))
+               spin_lock_irq(&cct->task_lock);
+               cql_empty = list_empty(&cct->cq_list);
+               spin_unlock_irq(&cct->task_lock);
+               if (!cql_empty)
                        run_comp_task(__cct);
 
                set_current_state(TASK_INTERRUPTIBLE);
@@ -693,8 +695,6 @@ static void destroy_comp_task(struct ehca_comp_pool *pool,
 
        if (task)
                kthread_stop(task);
-
-       return;
 }
 
 static void take_over_work(struct ehca_comp_pool *pool,
@@ -815,6 +815,4 @@ void ehca_destroy_comp_pool(void)
        free_percpu(pool->cpu_comp_tasks);
        kfree(pool);
 #endif
-
-       return;
 }