IB/cm: Mark stale CM id's whenever the mad agent was unregistered
authorMark Bloch <markb@mellanox.com>
Thu, 27 Oct 2016 13:36:27 +0000 (16:36 +0300)
committerWilly Tarreau <w@1wt.eu>
Fri, 10 Feb 2017 10:03:38 +0000 (11:03 +0100)
commit 9db0ff53cb9b43ed75bacd42a89c1a0ab048b2b0 upstream.

When there is a CM id object that has port assigned to it, it means that
the cm-id asked for the specific port that it should go by it, but if
that port was removed (hot-unplug event) the cm-id was not updated.
In order to fix that the port keeps a list of all the cm-id's that are
planning to go by it, whenever the port is removed it marks all of them
as invalid.

This commit fixes a kernel panic which happens when running traffic between
guests and we force reboot a guest mid traffic, it triggers a kernel panic:

 Call Trace:
  [<ffffffff815271fa>] ? panic+0xa7/0x16f
  [<ffffffff8152b534>] ? oops_end+0xe4/0x100
  [<ffffffff8104a00b>] ? no_context+0xfb/0x260
  [<ffffffff81084db2>] ? del_timer_sync+0x22/0x30
  [<ffffffff8104a295>] ? __bad_area_nosemaphore+0x125/0x1e0
  [<ffffffff81084240>] ? process_timeout+0x0/0x10
  [<ffffffff8104a363>] ? bad_area_nosemaphore+0x13/0x20
  [<ffffffff8104aabf>] ? __do_page_fault+0x31f/0x480
  [<ffffffff81065df0>] ? default_wake_function+0x0/0x20
  [<ffffffffa0752675>] ? free_msg+0x55/0x70 [mlx5_core]
  [<ffffffffa0753434>] ? cmd_exec+0x124/0x840 [mlx5_core]
  [<ffffffff8105a924>] ? find_busiest_group+0x244/0x9f0
  [<ffffffff8152d45e>] ? do_page_fault+0x3e/0xa0
  [<ffffffff8152a815>] ? page_fault+0x25/0x30
  [<ffffffffa024da25>] ? cm_alloc_msg+0x35/0xc0 [ib_cm]
  [<ffffffffa024e821>] ? ib_send_cm_dreq+0xb1/0x1e0 [ib_cm]
  [<ffffffffa024f836>] ? cm_destroy_id+0x176/0x320 [ib_cm]
  [<ffffffffa024fb00>] ? ib_destroy_cm_id+0x10/0x20 [ib_cm]
  [<ffffffffa034f527>] ? ipoib_cm_free_rx_reap_list+0xa7/0x110 [ib_ipoib]
  [<ffffffffa034f590>] ? ipoib_cm_rx_reap+0x0/0x20 [ib_ipoib]
  [<ffffffffa034f5a5>] ? ipoib_cm_rx_reap+0x15/0x20 [ib_ipoib]
  [<ffffffff81094d20>] ? worker_thread+0x170/0x2a0
  [<ffffffff8109b2a0>] ? autoremove_wake_function+0x0/0x40
  [<ffffffff81094bb0>] ? worker_thread+0x0/0x2a0
  [<ffffffff8109aef6>] ? kthread+0x96/0xa0
  [<ffffffff8100c20a>] ? child_rip+0xa/0x20
  [<ffffffff8109ae60>] ? kthread+0x0/0xa0
  [<ffffffff8100c200>] ? child_rip+0x0/0x20

Fixes: a977049dacde ("[PATCH] IB: Add the kernel CM implementation")
Signed-off-by: Mark Bloch <markb@mellanox.com>
Signed-off-by: Erez Shitrit <erezsh@mellanox.com>
Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
drivers/infiniband/core/cm.c

index c410217fbe890344197d51ac4e46a9056bb7db2a..951a4f6a3b11ad5c5d819d2c44fc1a3361726aad 100644 (file)
@@ -79,6 +79,8 @@ static struct ib_cm {
        __be32 random_id_operand;
        struct list_head timewait_list;
        struct workqueue_struct *wq;
+       /* Sync on cm change port state */
+       spinlock_t state_lock;
 } cm;
 
 /* Counter indexes ordered by attribute ID */
@@ -160,6 +162,8 @@ struct cm_port {
        struct ib_mad_agent *mad_agent;
        struct kobject port_obj;
        u8 port_num;
+       struct list_head cm_priv_prim_list;
+       struct list_head cm_priv_altr_list;
        struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
 };
 
@@ -237,6 +241,12 @@ struct cm_id_private {
        u8 service_timeout;
        u8 target_ack_delay;
 
+       struct list_head prim_list;
+       struct list_head altr_list;
+       /* Indicates that the send port mad is registered and av is set */
+       int prim_send_port_not_ready;
+       int altr_send_port_not_ready;
+
        struct list_head work_list;
        atomic_t work_count;
 };
@@ -255,19 +265,46 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
        struct ib_mad_agent *mad_agent;
        struct ib_mad_send_buf *m;
        struct ib_ah *ah;
+       struct cm_av *av;
+       unsigned long flags, flags2;
+       int ret = 0;
 
+       /* don't let the port to be released till the agent is down */
+       spin_lock_irqsave(&cm.state_lock, flags2);
+       spin_lock_irqsave(&cm.lock, flags);
+       if (!cm_id_priv->prim_send_port_not_ready)
+               av = &cm_id_priv->av;
+       else if (!cm_id_priv->altr_send_port_not_ready &&
+                (cm_id_priv->alt_av.port))
+               av = &cm_id_priv->alt_av;
+       else {
+               pr_info("%s: not valid CM id\n", __func__);
+               ret = -ENODEV;
+               spin_unlock_irqrestore(&cm.lock, flags);
+               goto out;
+       }
+       spin_unlock_irqrestore(&cm.lock, flags);
+       /* Make sure the port haven't released the mad yet */
        mad_agent = cm_id_priv->av.port->mad_agent;
-       ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr);
-       if (IS_ERR(ah))
-               return PTR_ERR(ah);
+       if (!mad_agent) {
+               pr_info("%s: not a valid MAD agent\n", __func__);
+               ret = -ENODEV;
+               goto out;
+       }
+       ah = ib_create_ah(mad_agent->qp->pd, &av->ah_attr);
+       if (IS_ERR(ah)) {
+               ret = PTR_ERR(ah);
+               goto out;
+       }
 
        m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
-                              cm_id_priv->av.pkey_index,
+                              av->pkey_index,
                               0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
                               GFP_ATOMIC);
        if (IS_ERR(m)) {
                ib_destroy_ah(ah);
-               return PTR_ERR(m);
+               ret = PTR_ERR(m);
+               goto out;
        }
 
        /* Timeout set by caller if response is expected. */
@@ -277,7 +314,10 @@ static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
        atomic_inc(&cm_id_priv->refcount);
        m->context[0] = cm_id_priv;
        *msg = m;
-       return 0;
+
+out:
+       spin_unlock_irqrestore(&cm.state_lock, flags2);
+       return ret;
 }
 
 static int cm_alloc_response_msg(struct cm_port *port,
@@ -346,7 +386,8 @@ static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
                           grh, &av->ah_attr);
 }
 
-static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av,
+                             struct cm_id_private *cm_id_priv)
 {
        struct cm_device *cm_dev;
        struct cm_port *port = NULL;
@@ -376,7 +417,18 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
        ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
                             &av->ah_attr);
        av->timeout = path->packet_life_time + 1;
-       return 0;
+
+       spin_lock_irqsave(&cm.lock, flags);
+       if (&cm_id_priv->av == av)
+               list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list);
+       else if (&cm_id_priv->alt_av == av)
+               list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list);
+       else
+               ret = -EINVAL;
+
+       spin_unlock_irqrestore(&cm.lock, flags);
+
+       return ret;
 }
 
 static int cm_alloc_id(struct cm_id_private *cm_id_priv)
@@ -716,6 +768,8 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
        spin_lock_init(&cm_id_priv->lock);
        init_completion(&cm_id_priv->comp);
        INIT_LIST_HEAD(&cm_id_priv->work_list);
+       INIT_LIST_HEAD(&cm_id_priv->prim_list);
+       INIT_LIST_HEAD(&cm_id_priv->altr_list);
        atomic_set(&cm_id_priv->work_count, -1);
        atomic_set(&cm_id_priv->refcount, 1);
        return &cm_id_priv->id;
@@ -914,6 +968,15 @@ retest:
                break;
        }
 
+       spin_lock_irq(&cm.lock);
+       if (!list_empty(&cm_id_priv->altr_list) &&
+           (!cm_id_priv->altr_send_port_not_ready))
+               list_del(&cm_id_priv->altr_list);
+       if (!list_empty(&cm_id_priv->prim_list) &&
+           (!cm_id_priv->prim_send_port_not_ready))
+               list_del(&cm_id_priv->prim_list);
+       spin_unlock_irq(&cm.lock);
+
        cm_free_id(cm_id->local_id);
        cm_deref_id(cm_id_priv);
        wait_for_completion(&cm_id_priv->comp);
@@ -1137,12 +1200,13 @@ int ib_send_cm_req(struct ib_cm_id *cm_id,
                goto out;
        }
 
-       ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
+       ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av,
+                                cm_id_priv);
        if (ret)
                goto error1;
        if (param->alternate_path) {
                ret = cm_init_av_by_path(param->alternate_path,
-                                        &cm_id_priv->alt_av);
+                                        &cm_id_priv->alt_av, cm_id_priv);
                if (ret)
                        goto error1;
        }
@@ -1562,7 +1626,8 @@ static int cm_req_handler(struct cm_work *work)
 
        cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
        cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
-       ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+       ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av,
+                                cm_id_priv);
        if (ret) {
                ib_get_cached_gid(work->port->cm_dev->ib_device,
                                  work->port->port_num, 0, &work->path[0].sgid);
@@ -1572,7 +1637,8 @@ static int cm_req_handler(struct cm_work *work)
                goto rejected;
        }
        if (req_msg->alt_local_lid) {
-               ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+               ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av,
+                                        cm_id_priv);
                if (ret) {
                        ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
                                       &work->path[0].sgid,
@@ -2627,7 +2693,8 @@ int ib_send_cm_lap(struct ib_cm_id *cm_id,
                goto out;
        }
 
-       ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+       ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av,
+                                cm_id_priv);
        if (ret)
                goto out;
        cm_id_priv->alt_av.timeout =
@@ -2739,7 +2806,8 @@ static int cm_lap_handler(struct cm_work *work)
        cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
                                work->mad_recv_wc->recv_buf.grh,
                                &cm_id_priv->av);
-       cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+       cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av,
+                          cm_id_priv);
        ret = atomic_inc_and_test(&cm_id_priv->work_count);
        if (!ret)
                list_add_tail(&work->list, &cm_id_priv->work_list);
@@ -2931,7 +2999,7 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
                return -EINVAL;
 
        cm_id_priv = container_of(cm_id, struct cm_id_private, id);
-       ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
+       ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv);
        if (ret)
                goto out;
 
@@ -3352,7 +3420,9 @@ out:
 static int cm_migrate(struct ib_cm_id *cm_id)
 {
        struct cm_id_private *cm_id_priv;
+       struct cm_av tmp_av;
        unsigned long flags;
+       int tmp_send_port_not_ready;
        int ret = 0;
 
        cm_id_priv = container_of(cm_id, struct cm_id_private, id);
@@ -3361,7 +3431,14 @@ static int cm_migrate(struct ib_cm_id *cm_id)
            (cm_id->lap_state == IB_CM_LAP_UNINIT ||
             cm_id->lap_state == IB_CM_LAP_IDLE)) {
                cm_id->lap_state = IB_CM_LAP_IDLE;
+               /* Swap address vector */
+               tmp_av = cm_id_priv->av;
                cm_id_priv->av = cm_id_priv->alt_av;
+               cm_id_priv->alt_av = tmp_av;
+               /* Swap port send ready state */
+               tmp_send_port_not_ready = cm_id_priv->prim_send_port_not_ready;
+               cm_id_priv->prim_send_port_not_ready = cm_id_priv->altr_send_port_not_ready;
+               cm_id_priv->altr_send_port_not_ready = tmp_send_port_not_ready;
        } else
                ret = -EINVAL;
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -3767,6 +3844,9 @@ static void cm_add_one(struct ib_device *ib_device)
                port->cm_dev = cm_dev;
                port->port_num = i;
 
+               INIT_LIST_HEAD(&port->cm_priv_prim_list);
+               INIT_LIST_HEAD(&port->cm_priv_altr_list);
+
                ret = cm_create_port_fs(port);
                if (ret)
                        goto error1;
@@ -3813,6 +3893,8 @@ static void cm_remove_one(struct ib_device *ib_device)
 {
        struct cm_device *cm_dev;
        struct cm_port *port;
+       struct cm_id_private *cm_id_priv;
+       struct ib_mad_agent *cur_mad_agent;
        struct ib_port_modify port_modify = {
                .clr_port_cap_mask = IB_PORT_CM_SUP
        };
@@ -3830,10 +3912,22 @@ static void cm_remove_one(struct ib_device *ib_device)
        for (i = 1; i <= ib_device->phys_port_cnt; i++) {
                port = cm_dev->port[i-1];
                ib_modify_port(ib_device, port->port_num, 0, &port_modify);
-               ib_unregister_mad_agent(port->mad_agent);
+               /* Mark all the cm_id's as not valid */
+               spin_lock_irq(&cm.lock);
+               list_for_each_entry(cm_id_priv, &port->cm_priv_altr_list, altr_list)
+                       cm_id_priv->altr_send_port_not_ready = 1;
+               list_for_each_entry(cm_id_priv, &port->cm_priv_prim_list, prim_list)
+                       cm_id_priv->prim_send_port_not_ready = 1;
+               spin_unlock_irq(&cm.lock);
                flush_workqueue(cm.wq);
+               spin_lock_irq(&cm.state_lock);
+               cur_mad_agent = port->mad_agent;
+               port->mad_agent = NULL;
+               spin_unlock_irq(&cm.state_lock);
+               ib_unregister_mad_agent(cur_mad_agent);
                cm_remove_port_fs(port);
        }
+
        device_unregister(cm_dev->device);
        kfree(cm_dev);
 }
@@ -3846,6 +3940,7 @@ static int __init ib_cm_init(void)
        INIT_LIST_HEAD(&cm.device_list);
        rwlock_init(&cm.device_lock);
        spin_lock_init(&cm.lock);
+       spin_lock_init(&cm.state_lock);
        cm.listen_service_table = RB_ROOT;
        cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
        cm.remote_id_table = RB_ROOT;