IB/core: Add rdma_network_type to wc
authorSomnath Kotur <Somnath.Kotur@Avagotech.Com>
Wed, 23 Dec 2015 12:56:51 +0000 (14:56 +0200)
committerDoug Ledford <dledford@redhat.com>
Wed, 23 Dec 2015 15:35:11 +0000 (10:35 -0500)
Providers should tell IB core the wc's network type.
This is used in order to search for the proper GID in the
GID table. When using HCAs that can't provide this info,
IB core tries to deep examine the packet and extract
the GID type by itself.

We choose sgid_index and type from all the matching entries in
RDMA-CM based on hint from the IP stack and we set hop_limit for
the IP packet based on above hint from IP stack.

Signed-off-by: Matan Barak <matanb@mellanox.com>
Signed-off-by: Somnath Kotur <Somnath.Kotur@Avagotech.Com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
drivers/infiniband/core/addr.c
drivers/infiniband/core/cma.c
drivers/infiniband/core/verbs.c
include/rdma/ib_addr.h
include/rdma/ib_verbs.h

index 34b1adad07aacf92e9bbfa9621e084f2ec756f74..6e35299e00e46c76f0f2eb2ba267ba78e3de496d 100644 (file)
@@ -256,6 +256,12 @@ static int addr4_resolve(struct sockaddr_in *src_in,
                goto put;
        }
 
+       /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
+        * routable) and we could set the network type accordingly.
+        */
+       if (rt->rt_uses_gateway)
+               addr->network = RDMA_NETWORK_IPV4;
+
        ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
 put:
        ip_rt_put(rt);
@@ -270,6 +276,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 {
        struct flowi6 fl6;
        struct dst_entry *dst;
+       struct rt6_info *rt;
        int ret;
 
        memset(&fl6, 0, sizeof fl6);
@@ -281,6 +288,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
        if ((ret = dst->error))
                goto put;
 
+       rt = (struct rt6_info *)dst;
        if (ipv6_addr_any(&fl6.saddr)) {
                ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev,
                                         &fl6.daddr, 0, &fl6.saddr);
@@ -304,6 +312,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
                goto put;
        }
 
+       /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't
+        * routable) and we could set the network type accordingly.
+        */
+       if (rt->rt6i_flags & RTF_GATEWAY)
+               addr->network = RDMA_NETWORK_IPV6;
+
        ret = dst_fetch_ha(dst, addr, &fl6.daddr);
 put:
        dst_release(dst);
index 446323a9d38fb620643bc624a4ff4887739d96be..0a29d6083b77c4702febcc2f7123fd23de57bbf1 100644 (file)
@@ -2291,6 +2291,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 {
        struct rdma_route *route = &id_priv->id.route;
        struct rdma_addr *addr = &route->addr;
+       enum ib_gid_type network_gid_type;
        struct cma_work *work;
        int ret;
        struct net_device *ndev = NULL;
@@ -2329,7 +2330,15 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
        rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
                    &route->path_rec->dgid);
 
-       route->path_rec->hop_limit = 1;
+       /* Use the hint from IP Stack to select GID Type */
+       network_gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+       if (addr->dev_addr.network != RDMA_NETWORK_IB) {
+               route->path_rec->gid_type = network_gid_type;
+               /* TODO: get the hoplimit from the inet/inet6 device */
+               route->path_rec->hop_limit = IPV6_DEFAULT_HOPLIMIT;
+       } else {
+               route->path_rec->hop_limit = 1;
+       }
        route->path_rec->reversible = 1;
        route->path_rec->pkey = cpu_to_be16(0xffff);
        route->path_rec->mtu_selector = IB_SA_EQ;
index e397d8bb4bdec5bd5e8e624761ff2976d2a69cd7..b25e5fbac5a4d9d213386922a165ca6e20cedf02 100644 (file)
@@ -305,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 }
 EXPORT_SYMBOL(ib_create_ah);
 
+static int ib_get_header_version(const union rdma_network_hdr *hdr)
+{
+       const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
+       struct iphdr ip4h_checked;
+       const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
+
+       /* If it's IPv6, the version must be 6, otherwise, the first
+        * 20 bytes (before the IPv4 header) are garbled.
+        */
+       if (ip6h->version != 6)
+               return (ip4h->version == 4) ? 4 : 0;
+       /* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+       /* RoCE v2 requires no options, thus header length
+        * must be 5 words
+        */
+       if (ip4h->ihl != 5)
+               return 6;
+
+       /* Verify checksum.
+        * We can't write on scattered buffers so we need to copy to
+        * temp buffer.
+        */
+       memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+       ip4h_checked.check = 0;
+       ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+       /* if IPv4 header checksum is OK, believe it */
+       if (ip4h->check == ip4h_checked.check)
+               return 4;
+       return 6;
+}
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+                                                    u8 port_num,
+                                                    const struct ib_grh *grh)
+{
+       int grh_version;
+
+       if (rdma_protocol_ib(device, port_num))
+               return RDMA_NETWORK_IB;
+
+       grh_version = ib_get_header_version((union rdma_network_hdr *)grh);
+
+       if (grh_version == 4)
+               return RDMA_NETWORK_IPV4;
+
+       if (grh->next_hdr == IPPROTO_UDP)
+               return RDMA_NETWORK_IPV6;
+
+       return RDMA_NETWORK_ROCE_V1;
+}
+
 struct find_gid_index_context {
        u16 vlan_id;
+       enum ib_gid_type gid_type;
 };
 
 static bool find_gid_index(const union ib_gid *gid,
@@ -316,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid,
        struct find_gid_index_context *ctx =
                (struct find_gid_index_context *)context;
 
+       if (ctx->gid_type != gid_attr->gid_type)
+               return false;
+
        if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
            (is_vlan_dev(gid_attr->ndev) &&
             vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
@@ -326,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid,
 
 static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
                                   u16 vlan_id, const union ib_gid *sgid,
+                                  enum ib_gid_type gid_type,
                                   u16 *gid_index)
 {
-       struct find_gid_index_context context = {.vlan_id = vlan_id};
+       struct find_gid_index_context context = {.vlan_id = vlan_id,
+                                                .gid_type = gid_type};
 
        return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
                                     &context, gid_index);
 }
 
+static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr,
+                                 enum rdma_network_type net_type,
+                                 union ib_gid *sgid, union ib_gid *dgid)
+{
+       struct sockaddr_in  src_in;
+       struct sockaddr_in  dst_in;
+       __be32 src_saddr, dst_saddr;
+
+       if (!sgid || !dgid)
+               return -EINVAL;
+
+       if (net_type == RDMA_NETWORK_IPV4) {
+               memcpy(&src_in.sin_addr.s_addr,
+                      &hdr->roce4grh.saddr, 4);
+               memcpy(&dst_in.sin_addr.s_addr,
+                      &hdr->roce4grh.daddr, 4);
+               src_saddr = src_in.sin_addr.s_addr;
+               dst_saddr = dst_in.sin_addr.s_addr;
+               ipv6_addr_set_v4mapped(src_saddr,
+                                      (struct in6_addr *)sgid);
+               ipv6_addr_set_v4mapped(dst_saddr,
+                                      (struct in6_addr *)dgid);
+               return 0;
+       } else if (net_type == RDMA_NETWORK_IPV6 ||
+                  net_type == RDMA_NETWORK_IB) {
+               *dgid = hdr->ibgrh.dgid;
+               *sgid = hdr->ibgrh.sgid;
+               return 0;
+       } else {
+               return -EINVAL;
+       }
+}
+
 int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
                       const struct ib_wc *wc, const struct ib_grh *grh,
                       struct ib_ah_attr *ah_attr)
@@ -341,9 +432,25 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
        u32 flow_class;
        u16 gid_index;
        int ret;
+       enum rdma_network_type net_type = RDMA_NETWORK_IB;
+       enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+       union ib_gid dgid;
+       union ib_gid sgid;
 
        memset(ah_attr, 0, sizeof *ah_attr);
        if (rdma_cap_eth_ah(device, port_num)) {
+               if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+                       net_type = wc->network_hdr_type;
+               else
+                       net_type = ib_get_net_type_by_grh(device, port_num, grh);
+               gid_type = ib_network_to_gid_type(net_type);
+       }
+       ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+                                    &sgid, &dgid);
+       if (ret)
+               return ret;
+
+       if (rdma_protocol_roce(device, port_num)) {
                u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
                                wc->vlan_id : 0xffff;
 
@@ -352,7 +459,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 
                if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
                    !(wc->wc_flags & IB_WC_WITH_VLAN)) {
-                       ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
+                       ret = rdma_addr_find_dmac_by_grh(&dgid, &sgid,
                                                         ah_attr->dmac,
                                                         wc->wc_flags & IB_WC_WITH_VLAN ?
                                                         NULL : &vlan_id,
@@ -362,7 +469,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
                }
 
                ret = get_sgid_index_from_eth(device, port_num, vlan_id,
-                                             &grh->dgid, &gid_index);
+                                             &dgid, gid_type, &gid_index);
                if (ret)
                        return ret;
 
@@ -377,10 +484,10 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 
        if (wc->wc_flags & IB_WC_GRH) {
                ah_attr->ah_flags = IB_AH_GRH;
-               ah_attr->grh.dgid = grh->sgid;
+               ah_attr->grh.dgid = sgid;
 
                if (!rdma_cap_eth_ah(device, port_num)) {
-                       ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+                       ret = ib_find_cached_gid_by_port(device, &dgid,
                                                         IB_GID_TYPE_IB,
                                                         port_num, NULL,
                                                         &gid_index);
@@ -1020,6 +1127,12 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
                                        ret = -ENXIO;
                                goto out;
                        }
+                       if (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+                               /* TODO: get the hoplimit from the inet/inet6
+                                * device
+                                */
+                               qp_attr->ah_attr.grh.hop_limit =
+                                                       IPV6_DEFAULT_HOPLIMIT;
 
                        ifindex = sgid_attr.ndev->ifindex;
 
index 11528591d0d714f3ea1004566cb799952a9ca5bd..c799caacf353333d1997c60eb5c7cf0a83823f2d 100644 (file)
@@ -83,6 +83,7 @@ struct rdma_dev_addr {
        int bound_dev_if;
        enum rdma_transport_type transport;
        struct net *net;
+       enum rdma_network_type network;
 };
 
 /**
index ab05ef695d633ef9bc54eda98de9c793cb1e96e8..368fc22f30f191548aebc398691daed8ee19d39f 100644 (file)
@@ -51,6 +51,8 @@
 #include <linux/socket.h>
 #include <linux/irq_poll.h>
 #include <uapi/linux/if_ether.h>
+#include <net/ipv6.h>
+#include <net/ip.h>
 
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
@@ -109,6 +111,35 @@ enum rdma_protocol_type {
 __attribute_const__ enum rdma_transport_type
 rdma_node_get_transport(enum rdma_node_type node_type);
 
+enum rdma_network_type {
+       RDMA_NETWORK_IB,
+       RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB,
+       RDMA_NETWORK_IPV4,
+       RDMA_NETWORK_IPV6
+};
+
+static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
+{
+       if (network_type == RDMA_NETWORK_IPV4 ||
+           network_type == RDMA_NETWORK_IPV6)
+               return IB_GID_TYPE_ROCE_UDP_ENCAP;
+
+       /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */
+       return IB_GID_TYPE_IB;
+}
+
+static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type,
+                                                           union ib_gid *gid)
+{
+       if (gid_type == IB_GID_TYPE_IB)
+               return RDMA_NETWORK_IB;
+
+       if (ipv6_addr_v4mapped((struct in6_addr *)gid))
+               return RDMA_NETWORK_IPV4;
+       else
+               return RDMA_NETWORK_IPV6;
+}
+
 enum rdma_link_layer {
        IB_LINK_LAYER_UNSPECIFIED,
        IB_LINK_LAYER_INFINIBAND,
@@ -537,6 +568,17 @@ struct ib_grh {
        union ib_gid    dgid;
 };
 
+union rdma_network_hdr {
+       struct ib_grh ibgrh;
+       struct {
+               /* The IB spec states that if it's IPv4, the header
+                * is located in the last 20 bytes of the header.
+                */
+               u8              reserved[20];
+               struct iphdr    roce4grh;
+       };
+};
+
 enum {
        IB_MULTICAST_QPN = 0xffffff
 };
@@ -773,6 +815,7 @@ enum ib_wc_flags {
        IB_WC_IP_CSUM_OK        = (1<<3),
        IB_WC_WITH_SMAC         = (1<<4),
        IB_WC_WITH_VLAN         = (1<<5),
+       IB_WC_WITH_NETWORK_HDR_TYPE     = (1<<6),
 };
 
 struct ib_wc {
@@ -798,6 +841,7 @@ struct ib_wc {
        u8                      port_num;       /* valid only for DR SMPs on switches */
        u8                      smac[ETH_ALEN];
        u16                     vlan_id;
+       u8                      network_hdr_type;
 };
 
 enum ib_cq_notify_flags {