From ff87e97a9d70c9ae133d3d3d7792b26ab85f4297 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 14:13:15 -0800 Subject: [PATCH] RDS: make m_rdma_op a member of rds_message This eliminates a separate memory alloc, although it is now necessary to add an "r_active" flag, since it is no longer possible to use the m_rdma_op pointer as an indicator of whether an rdma op is present. rdma SGs allocated from rm sg pool. rds_rm_size also gets bigger. It's a little inefficient to run through CMSGs twice, but it makes later steps a lot smoother. Signed-off-by: Andy Grover --- net/rds/ib_send.c | 20 ++++---- net/rds/iw_send.c | 16 +++---- net/rds/message.c | 9 ++-- net/rds/rdma.c | 113 ++++++++++++++++++++++++---------------------- net/rds/rds.h | 2 +- net/rds/send.c | 59 ++++++++++++++++++------ 6 files changed, 129 insertions(+), 90 deletions(-) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 575fce463c65..f0edfdb2866c 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -85,8 +85,8 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, rm->data.m_sg, rm->data.m_nents, DMA_TO_DEVICE); - if (rm->rdma.m_rdma_op) { - rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) { + rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -110,10 +110,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, */ rds_ib_send_rdma_complete(rm, wc_status); - if (rm->rdma.m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + if (rm->rdma.m_rdma_op.r_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -243,8 +243,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void 
*context) rm = rds_send_get_message(conn, send->s_op); if (rm) { - if (rm->rdma.m_rdma_op) - rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) + rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); rds_ib_send_rdma_complete(rm, wc.status); rds_message_put(rm); } @@ -560,10 +560,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.m_rdma_op) { + if (rm->rdma.m_rdma_op.r_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -601,7 +601,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence) + if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence) send_flags = IB_SEND_FENCE; /* diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 62234b804d93..9b79a1b10445 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -85,8 +85,8 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rm->data.m_sg, rm->data.m_nents, DMA_TO_DEVICE); - if (rm->rdma.m_rdma_op) { - rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -110,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, */ rds_iw_send_rdma_complete(rm, wc_status); - if (rm->rdma.m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + if (rm->rdma.m_rdma_op.r_write) + rds_stats_add(s_send_rdma_bytes, 
rm->rdma.m_rdma_op.r_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.m_rdma_op) { + if (rm->rdma.m_rdma_op.r_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -632,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence) + if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence) send_flags = IB_SEND_FENCE; /* diff --git a/net/rds/message.c b/net/rds/message.c index fb382fbb5b6f..4352ce79b376 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -69,8 +69,8 @@ static void rds_message_purge(struct rds_message *rm) } rm->data.m_nents = 0; - if (rm->rdma.m_rdma_op) - rds_rdma_free_op(rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) + rds_rdma_free_op(&rm->rdma.m_rdma_op); if (rm->rdma.m_rdma_mr) rds_mr_put(rm->rdma.m_rdma_mr); } @@ -259,14 +259,17 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in { struct rds_message *rm; unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); + rm = rds_message_alloc(extra_bytes, GFP_KERNEL); if (!rm) return ERR_PTR(-ENOMEM); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); 
rm->data.m_nents = ceil(total_len, PAGE_SIZE); + rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs); for (i = 0; i < rm->data.m_nents; ++i) { sg_set_page(&rm->data.m_sg[i], diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a21edad33950..7ff3379bab14 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -458,26 +458,60 @@ void rds_rdma_free_op(struct rds_rdma_op *ro) } kfree(ro->r_notifier); - kfree(ro); + ro->r_notifier = NULL; + ro->r_active = 0; +} + +/* + * Count the number of pages needed to describe an incoming iovec. + */ +static int rds_rdma_pages(struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_iovec __user *local_vec; + unsigned int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) +{ + return rds_rdma_pages(args) * sizeof(struct scatterlist); } /* * args is a pointer to an in-kernel copy in the sendmsg cmsg. 
*/ -static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, - struct rds_rdma_args *args) +static int rds_rdma_prepare(struct rds_message *rm, + struct rds_sock *rs, + struct rds_rdma_args *args) { struct rds_iovec vec; - struct rds_rdma_op *op = NULL; + struct rds_rdma_op *op = &rm->rdma.m_rdma_op; unsigned int nr_pages; - unsigned int max_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec __user *local_vec; - struct scatterlist *sg; unsigned int nr; unsigned int i, j; - int ret; + int ret = 0; if (rs->rs_bound_addr == 0) { @@ -490,44 +524,21 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, goto out; } - nr_pages = 0; - max_pages = 0; - - local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; - - /* figure out the number of pages in the vector */ - for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; - goto out; - } - - max_pages = max(nr, max_pages); - nr_pages += nr; - } - - pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; + nr_pages = rds_rdma_pages(args); + if (nr_pages < 0) goto out; - } - op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); - if (!op) { + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { ret = -ENOMEM; goto out; } + op->r_sg = rds_message_alloc_sgs(rm, nr_pages); op->r_write = !!(args->flags & RDS_RDMA_READWRITE); op->r_fence = !!(args->flags & RDS_RDMA_FENCE); op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->r_active = 1; op->r_recverr = rs->rs_recverr; WARN_ON(!nr_pages); sg_init_table(op->r_sg, nr_pages); @@ -564,6 +575,8 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, (unsigned long long)args->remote_vec.addr, op->r_key); + local_vec = (struct rds_iovec __user *)(unsigned long) 
args->local_vec_addr; + for (i = 0; i < args->nr_local; i++) { if (copy_from_user(&vec, &local_vec[i], sizeof(struct rds_iovec))) { @@ -580,11 +593,6 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, rs->rs_user_addr = vec.addr; rs->rs_user_bytes = vec.bytes; - /* did the user change the vec under us? */ - if (nr > max_pages || op->r_nents + nr > nr_pages) { - ret = -EINVAL; - goto out; - } /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ @@ -599,6 +607,7 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, for (j = 0; j < nr; j++) { unsigned int offset = vec.addr & ~PAGE_MASK; + struct scatterlist *sg; sg = &op->r_sg[op->r_nents + j]; sg_set_page(sg, pages[j], @@ -628,12 +637,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ret = 0; out: kfree(pages); - if (ret) { - if (op) - rds_rdma_free_op(op); - op = ERR_PTR(ret); - } - return op; + if (ret) + rds_rdma_free_op(op); + + return ret; } /* @@ -643,17 +650,17 @@ out: int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { - struct rds_rdma_op *op; + int ret; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || - rm->rdma.m_rdma_op) + rm->rdma.m_rdma_op.r_active) return -EINVAL; - op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); - if (IS_ERR(op)) - return PTR_ERR(op); + ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg)); + if (ret) + return ret; + rds_stats_inc(s_send_rdma); - rm->rdma.m_rdma_op = op; return 0; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 7c4adbe8c284..0bb4957e0cfc 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -316,7 +316,7 @@ struct rds_message { rds_rdma_cookie_t m_rdma_cookie; struct { struct { - struct rds_rdma_op *m_rdma_op; + struct rds_rdma_op m_rdma_op; struct rds_mr *m_rdma_mr; } rdma; struct { diff --git a/net/rds/send.c b/net/rds/send.c index 89e26ffdc812..72dbe7fc4f54 100644 --- a/net/rds/send.c +++ 
b/net/rds/send.c @@ -235,7 +235,7 @@ int rds_send_xmit(struct rds_connection *conn) * connection. * Therefore, we never retransmit messages with RDMA ops. */ - if (rm->rdma.m_rdma_op && + if (rm->rdma.m_rdma_op.r_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) @@ -267,8 +267,8 @@ int rds_send_xmit(struct rds_connection *conn) * keep this simple and require that the transport either * send the whole rdma or none of it. */ - if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active && !conn->c_xmit_rdma_sent) { + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma.m_rdma_op); if (ret) break; conn->c_xmit_rdma_sent = 1; @@ -418,9 +418,9 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) spin_lock_irqsave(&rm->m_rs_lock, flags); - ro = rm->rdma.m_rdma_op; + ro = &rm->rdma.m_rdma_op; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro && ro->r_notify && ro->r_notifier) { + ro->r_active && ro->r_notify && ro->r_notifier) { notifier = ro->r_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -452,8 +452,8 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status { struct rds_rdma_op *ro; - ro = rm->rdma.m_rdma_op; - if (ro && ro->r_notify && ro->r_notifier) { + ro = &rm->rdma.m_rdma_op; + if (ro->r_active && ro->r_notify && ro->r_notifier) { ro->r_notifier->n_status = status; list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); ro->r_notifier = NULL; @@ -476,7 +476,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, spin_lock_irqsave(&conn->c_lock, flags); list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (rm->rdma.m_rdma_op == op) { + if (&rm->rdma.m_rdma_op == op) { atomic_inc(&rm->m_refcount); found = rm; goto out; @@ -484,7 +484,7 @@ struct rds_message *rds_send_get_message(struct 
rds_connection *conn, } list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (rm->rdma.m_rdma_op == op) { + if (&rm->rdma.m_rdma_op == op) { atomic_inc(&rm->m_refcount); found = rm; break; @@ -544,19 +544,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rds_rdma_op *ro = rm->rdma.m_rdma_op; + struct rds_rdma_op *ro = &rm->rdma.m_rdma_op; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); - if (ro && ro->r_notifier && (status || ro->r_notify)) { + if (ro->r_active && ro->r_notifier && + (status || ro->r_notify)) { notifier = ro->r_notifier; list_add_tail(&notifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; - rm->rdma.m_rdma_op->r_notifier = NULL; + rm->rdma.m_rdma_op.r_notifier = NULL; } was_on_sock = 1; rm->m_rs = NULL; @@ -763,9 +764,37 @@ out: */ static int rds_rm_size(struct msghdr *msg, int data_len) { + struct cmsghdr *cmsg; int size = 0; + int retval; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + break; + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + /* these are valid but do not add any size */ + break; + + default: + return -EINVAL; + } + + } - size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); return size; } @@ -896,11 +925,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; - if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) && + if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) && !conn->c_trans->xmit_rdma) { if 
(printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); + &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } -- 2.20.1