From: Jack Morgenstein Date: Sat, 4 Mar 2006 05:54:13 +0000 (-0800) Subject: IB/umad: Add support for large RMPP transfers X-Git-Tag: v2.6.17-rc1~1180^2~10 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f36e1793e25513380cae5958a9164d4cc4458ad0;p=linux-2.6 IB/umad: Add support for large RMPP transfers Add support for sending and receiving large RMPP transfers. The old code supports transfers only as large as a single contiguous kernel memory allocation. This patch uses a linked list of memory buffers when sending and receiving data to avoid needing contiguous pages for larger transfers. Receive side: copy the arriving MADs in chunks instead of coalescing them into one large buffer in kernel space. Send side: split a multipacket MAD buffer into a list of segments (multipacket_list) and send these using a gather list of size 2. Also, save a pointer to the last sent segment, and retrieve requested segments by walking the list starting at the last sent segment. Finally, save a pointer to the last-acked segment. When retrying, retrieve segments for resending relative to this pointer. When updating the last ack, start at this pointer. Signed-off-by: Jack Morgenstein Signed-off-by: Sean Hefty Signed-off-by: Roland Dreier --- diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 445ad0dda2..16549add8e 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -31,7 +31,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: mad.c 2817 2005-07-07 11:29:26Z halr $ + * $Id: mad.c 5596 2006-03-03 01:00:07Z sean.hefty $ */ #include @@ -765,18 +765,67 @@ out: return ret; } -static int get_buf_length(int hdr_len, int data_len) +static int get_pad_size(int hdr_len, int data_len) { int seg_size, pad; seg_size = sizeof(struct ib_mad) - hdr_len; if (data_len && seg_size) { pad = seg_size - data_len % seg_size; - if (pad == seg_size) - pad = 0; + return pad == seg_size ? 
0 : pad; } else - pad = seg_size; - return hdr_len + data_len + pad; + return seg_size; +} + +static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr) +{ + struct ib_rmpp_segment *s, *t; + + list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) { + list_del(&s->list); + kfree(s); + } +} + +static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, + gfp_t gfp_mask) +{ + struct ib_mad_send_buf *send_buf = &send_wr->send_buf; + struct ib_rmpp_mad *rmpp_mad = send_buf->mad; + struct ib_rmpp_segment *seg = NULL; + int left, seg_size, pad; + + send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len; + seg_size = send_buf->seg_size; + pad = send_wr->pad; + + /* Allocate data segments. */ + for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { + seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); + if (!seg) { + printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem " + "alloc failed for len %zd, gfp %#x\n", + sizeof (*seg) + seg_size, gfp_mask); + free_send_rmpp_list(send_wr); + return -ENOMEM; + } + seg->num = ++send_buf->seg_count; + list_add_tail(&seg->list, &send_wr->rmpp_list); + } + + /* Zero any padding */ + if (pad) + memset(seg->data + seg_size - pad, 0, pad); + + rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv-> + agent.rmpp_version; + rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA; + ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); + + send_wr->cur_seg = container_of(send_wr->rmpp_list.next, + struct ib_rmpp_segment, list); + send_wr->last_ack_seg = send_wr->cur_seg; + return 0; } struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, @@ -787,32 +836,40 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, { struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_wr_private *mad_send_wr; - int buf_size; + int pad, message_size, ret, size; void *buf; mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private, 
agent); - buf_size = get_buf_length(hdr_len, data_len); + pad = get_pad_size(hdr_len, data_len); + message_size = hdr_len + data_len + pad; if ((!mad_agent->rmpp_version && - (rmpp_active || buf_size > sizeof(struct ib_mad))) || - (!rmpp_active && buf_size > sizeof(struct ib_mad))) + (rmpp_active || message_size > sizeof(struct ib_mad))) || + (!rmpp_active && message_size > sizeof(struct ib_mad))) return ERR_PTR(-EINVAL); - buf = kzalloc(sizeof *mad_send_wr + buf_size, gfp_mask); + size = rmpp_active ? hdr_len : sizeof(struct ib_mad); + buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); if (!buf) return ERR_PTR(-ENOMEM); - mad_send_wr = buf + buf_size; + mad_send_wr = buf + size; + INIT_LIST_HEAD(&mad_send_wr->rmpp_list); mad_send_wr->send_buf.mad = buf; + mad_send_wr->send_buf.hdr_len = hdr_len; + mad_send_wr->send_buf.data_len = data_len; + mad_send_wr->pad = pad; mad_send_wr->mad_agent_priv = mad_agent_priv; - mad_send_wr->sg_list[0].length = buf_size; + mad_send_wr->sg_list[0].length = hdr_len; mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey; + mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len; + mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey; mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr; mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list; - mad_send_wr->send_wr.num_sge = 1; + mad_send_wr->send_wr.num_sge = 2; mad_send_wr->send_wr.opcode = IB_WR_SEND; mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED; mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn; @@ -820,13 +877,11 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index; if (rmpp_active) { - struct ib_rmpp_mad *rmpp_mad = mad_send_wr->send_buf.mad; - rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(hdr_len - - IB_MGMT_RMPP_HDR + data_len); - rmpp_mad->rmpp_hdr.rmpp_version = mad_agent->rmpp_version; - rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA; - ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, - 
IB_MGMT_RMPP_FLAG_ACTIVE); + ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask); + if (ret) { + kfree(buf); + return ERR_PTR(ret); + } } mad_send_wr->send_buf.mad_agent = mad_agent; @@ -835,14 +890,50 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, } EXPORT_SYMBOL(ib_create_send_mad); +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num) +{ + struct ib_mad_send_wr_private *mad_send_wr; + struct list_head *list; + + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + list = &mad_send_wr->cur_seg->list; + + if (mad_send_wr->cur_seg->num < seg_num) { + list_for_each_entry(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } else if (mad_send_wr->cur_seg->num > seg_num) { + list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list) + if (mad_send_wr->cur_seg->num == seg_num) + break; + } + return mad_send_wr->cur_seg->data; +} +EXPORT_SYMBOL(ib_get_rmpp_segment); + +static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr) +{ + if (mad_send_wr->send_buf.seg_count) + return ib_get_rmpp_segment(&mad_send_wr->send_buf, + mad_send_wr->seg_num); + else + return mad_send_wr->send_buf.mad + + mad_send_wr->send_buf.hdr_len; +} + void ib_free_send_mad(struct ib_mad_send_buf *send_buf) { struct ib_mad_agent_private *mad_agent_priv; + struct ib_mad_send_wr_private *mad_send_wr; mad_agent_priv = container_of(send_buf->mad_agent, struct ib_mad_agent_private, agent); - kfree(send_buf->mad); + mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private, + send_buf); + free_send_rmpp_list(mad_send_wr); + kfree(send_buf->mad); if (atomic_dec_and_test(&mad_agent_priv->refcount)) wake_up(&mad_agent_priv->wait); } @@ -865,10 +956,17 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; - sge->addr = dma_map_single(mad_agent->device->dma_device, - 
mad_send_wr->send_buf.mad, sge->length, - DMA_TO_DEVICE); - pci_unmap_addr_set(mad_send_wr, mapping, sge->addr); + sge[0].addr = dma_map_single(mad_agent->device->dma_device, + mad_send_wr->send_buf.mad, + sge[0].length, + DMA_TO_DEVICE); + pci_unmap_addr_set(mad_send_wr, header_mapping, sge[0].addr); + + sge[1].addr = dma_map_single(mad_agent->device->dma_device, + ib_get_payload(mad_send_wr), + sge[1].length, + DMA_TO_DEVICE); + pci_unmap_addr_set(mad_send_wr, payload_mapping, sge[1].addr); spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { @@ -885,11 +983,14 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) list_add_tail(&mad_send_wr->mad_list.list, list); } spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); - if (ret) + if (ret) { dma_unmap_single(mad_agent->device->dma_device, - pci_unmap_addr(mad_send_wr, mapping), - sge->length, DMA_TO_DEVICE); - + pci_unmap_addr(mad_send_wr, header_mapping), + sge[0].length, DMA_TO_DEVICE); + dma_unmap_single(mad_agent->device->dma_device, + pci_unmap_addr(mad_send_wr, payload_mapping), + sge[1].length, DMA_TO_DEVICE); + } return ret; } @@ -1860,8 +1961,11 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, retry: dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device, - pci_unmap_addr(mad_send_wr, mapping), + pci_unmap_addr(mad_send_wr, header_mapping), mad_send_wr->sg_list[0].length, DMA_TO_DEVICE); + dma_unmap_single(mad_send_wr->send_buf.mad_agent->device->dma_device, + pci_unmap_addr(mad_send_wr, payload_mapping), + mad_send_wr->sg_list[1].length, DMA_TO_DEVICE); queued_send_wr = NULL; spin_lock_irqsave(&send_queue->lock, flags); list_del(&mad_list->list); diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 570f78682a..a7125d4b5c 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -31,7 +31,7 @@ * CONNECTION WITH 
THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: mad_priv.h 2730 2005-06-28 16:43:03Z sean.hefty $ + * $Id: mad_priv.h 5596 2006-03-03 01:00:07Z sean.hefty $ */ #ifndef __IB_MAD_PRIV_H__ @@ -85,6 +85,12 @@ struct ib_mad_private { } mad; } __attribute__ ((packed)); +struct ib_rmpp_segment { + struct list_head list; + u32 num; + u8 data[0]; +}; + struct ib_mad_agent_private { struct list_head agent_list; struct ib_mad_agent agent; @@ -119,7 +125,8 @@ struct ib_mad_send_wr_private { struct list_head agent_list; struct ib_mad_agent_private *mad_agent_priv; struct ib_mad_send_buf send_buf; - DECLARE_PCI_UNMAP_ADDR(mapping) + DECLARE_PCI_UNMAP_ADDR(header_mapping) + DECLARE_PCI_UNMAP_ADDR(payload_mapping) struct ib_send_wr send_wr; struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG]; __be64 tid; @@ -130,11 +137,12 @@ struct ib_mad_send_wr_private { enum ib_wc_status status; /* RMPP control */ + struct list_head rmpp_list; + struct ib_rmpp_segment *last_ack_seg; + struct ib_rmpp_segment *cur_seg; int last_ack; int seg_num; int newwin; - int total_seg; - int data_offset; int pad; }; diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c index 3249e1d8c0..bacfdd5bdd 100644 --- a/drivers/infiniband/core/mad_rmpp.c +++ b/drivers/infiniband/core/mad_rmpp.c @@ -111,14 +111,14 @@ static int data_offset(u8 mgmt_class) return IB_MGMT_RMPP_HDR; } -static void format_ack(struct ib_rmpp_mad *ack, +static void format_ack(struct ib_mad_send_buf *msg, struct ib_rmpp_mad *data, struct mad_rmpp_recv *rmpp_recv) { + struct ib_rmpp_mad *ack = msg->mad; unsigned long flags; - memcpy(&ack->mad_hdr, &data->mad_hdr, - data_offset(data->mad_hdr.mgmt_class)); + memcpy(ack, &data->mad_hdr, msg->hdr_len); ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP; ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK; @@ -135,16 +135,16 @@ static void ack_recv(struct mad_rmpp_recv *rmpp_recv, struct ib_mad_recv_wc *recv_wc) { struct ib_mad_send_buf *msg; - int ret; + int 
ret, hdr_len; + hdr_len = data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp, - recv_wc->wc->pkey_index, 1, IB_MGMT_RMPP_HDR, - IB_MGMT_RMPP_DATA, GFP_KERNEL); + recv_wc->wc->pkey_index, 1, hdr_len, + 0, GFP_KERNEL); if (!msg) return; - format_ack(msg->mad, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, - rmpp_recv); + format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv); msg->ah = rmpp_recv->ah; ret = ib_post_send_mad(msg, NULL); if (ret) @@ -156,16 +156,17 @@ static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent, { struct ib_mad_send_buf *msg; struct ib_ah *ah; + int hdr_len; ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc, recv_wc->recv_buf.grh, agent->port_num); if (IS_ERR(ah)) return (void *) ah; + hdr_len = data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class); msg = ib_create_send_mad(agent, recv_wc->wc->src_qp, recv_wc->wc->pkey_index, 1, - IB_MGMT_RMPP_HDR, IB_MGMT_RMPP_DATA, - GFP_KERNEL); + hdr_len, 0, GFP_KERNEL); if (IS_ERR(msg)) ib_destroy_ah(ah); else @@ -195,8 +196,7 @@ static void nack_recv(struct ib_mad_agent_private *agent, return; rmpp_mad = msg->mad; - memcpy(rmpp_mad, recv_wc->recv_buf.mad, - data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class)); + memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len); rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP; rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION; @@ -433,44 +433,6 @@ static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv) return rmpp_wc; } -void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf) -{ - struct ib_mad_recv_buf *seg_buf; - struct ib_rmpp_mad *rmpp_mad; - void *data; - int size, len, offset; - u8 flags; - - len = mad_recv_wc->mad_len; - if (len <= sizeof(struct ib_mad)) { - memcpy(buf, mad_recv_wc->recv_buf.mad, len); - return; - } - - offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class); - - 
list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) { - rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad; - flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr); - - if (flags & IB_MGMT_RMPP_FLAG_FIRST) { - data = rmpp_mad; - size = sizeof(*rmpp_mad); - } else { - data = (void *) rmpp_mad + offset; - if (flags & IB_MGMT_RMPP_FLAG_LAST) - size = len; - else - size = sizeof(*rmpp_mad) - offset; - } - - memcpy(buf, data, size); - len -= size; - buf += size; - } -} -EXPORT_SYMBOL(ib_coalesce_recv_mad); - static struct ib_mad_recv_wc * continue_rmpp(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) @@ -570,50 +532,33 @@ start_rmpp(struct ib_mad_agent_private *agent, return mad_recv_wc; } -static inline u64 get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr) -{ - return mad_send_wr->sg_list[0].addr + mad_send_wr->data_offset + - (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset) * - (mad_send_wr->seg_num - 1); -} - static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; int timeout; - u32 paylen; + u32 paylen = 0; rmpp_mad = mad_send_wr->send_buf.mad; ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE); - rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(mad_send_wr->seg_num); + rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num); if (mad_send_wr->seg_num == 1) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST; - paylen = mad_send_wr->total_seg * IB_MGMT_RMPP_DATA - + paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA - mad_send_wr->pad; - rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); - mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad); - } else { - mad_send_wr->send_wr.num_sge = 2; - mad_send_wr->sg_list[0].length = mad_send_wr->data_offset; - mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr); - mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) - - mad_send_wr->data_offset; - mad_send_wr->sg_list[1].lkey = 
mad_send_wr->sg_list[0].lkey; - rmpp_mad->rmpp_hdr.paylen_newwin = 0; } - if (mad_send_wr->seg_num == mad_send_wr->total_seg) { + if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) { rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST; paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad; - rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); } + rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen); /* 2 seconds for an ACK until we can find the packet lifetime */ timeout = mad_send_wr->send_buf.timeout_ms; if (!timeout || timeout > 2000) mad_send_wr->timeout = msecs_to_jiffies(2000); - mad_send_wr->seg_num++; + return ib_send_mad(mad_send_wr); } @@ -629,7 +574,7 @@ static void abort_send(struct ib_mad_agent_private *agent, __be64 tid, if (!mad_send_wr) goto out; /* Unmatched send */ - if ((mad_send_wr->last_ack == mad_send_wr->total_seg) || + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) goto out; /* Send is already done */ @@ -645,6 +590,18 @@ out: spin_unlock_irqrestore(&agent->lock, flags); } +static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr, + int seg_num) +{ + struct list_head *list; + + wr->last_ack = seg_num; + list = &wr->last_ack_seg->list; + list_for_each_entry(wr->last_ack_seg, list, list) + if (wr->last_ack_seg->num == seg_num) + break; +} + static void process_rmpp_ack(struct ib_mad_agent_private *agent, struct ib_mad_recv_wc *mad_recv_wc) { @@ -675,11 +632,12 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, if (!mad_send_wr) goto out; /* Unmatched ACK */ - if ((mad_send_wr->last_ack == mad_send_wr->total_seg) || + if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) || (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS)) goto out; /* Send is already done */ - if (seg_num > mad_send_wr->total_seg || seg_num > mad_send_wr->newwin) { + if (seg_num > mad_send_wr->send_buf.seg_count || + 
seg_num > mad_send_wr->newwin) { spin_unlock_irqrestore(&agent->lock, flags); abort_send(agent, rmpp_mad->mad_hdr.tid, IB_MGMT_RMPP_STATUS_S2B); @@ -691,11 +649,11 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, goto out; /* Old ACK */ if (seg_num > mad_send_wr->last_ack) { - mad_send_wr->last_ack = seg_num; + adjust_last_ack(mad_send_wr, seg_num); mad_send_wr->retries = mad_send_wr->send_buf.retries; } mad_send_wr->newwin = newwin; - if (mad_send_wr->last_ack == mad_send_wr->total_seg) { + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { /* If no response is expected, the ACK completes the send */ if (!mad_send_wr->send_buf.timeout_ms) { struct ib_mad_send_wc wc; @@ -714,7 +672,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, mad_send_wr->send_buf.timeout_ms); } else if (mad_send_wr->refcount == 1 && mad_send_wr->seg_num < mad_send_wr->newwin && - mad_send_wr->seg_num <= mad_send_wr->total_seg) { + mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { /* Send failure will just result in a timeout/retry */ ret = send_next_seg(mad_send_wr); if (ret) @@ -838,31 +796,19 @@ out: int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_rmpp_mad *rmpp_mad; - int i, total_len, ret; + int ret; rmpp_mad = mad_send_wr->send_buf.mad; if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; - if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) + if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) { + mad_send_wr->seg_num = 1; return IB_RMPP_RESULT_INTERNAL; + } - if (mad_send_wr->send_wr.num_sge > 1) - return -EINVAL; /* TODO: support num_sge > 1 */ - - mad_send_wr->seg_num = 1; mad_send_wr->newwin = 1; - mad_send_wr->data_offset = data_offset(rmpp_mad->mad_hdr.mgmt_class); - - total_len = 0; - for (i = 0; i < mad_send_wr->send_wr.num_sge; i++) - total_len += mad_send_wr->send_wr.sg_list[i].length; - - mad_send_wr->total_seg = 
(total_len - mad_send_wr->data_offset) / - (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset); - mad_send_wr->pad = total_len - IB_MGMT_RMPP_HDR - - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin); /* We need to wait for the final ACK even if there isn't a response */ mad_send_wr->refcount += (mad_send_wr->timeout == 0); @@ -893,14 +839,14 @@ int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr, if (!mad_send_wr->timeout) return IB_RMPP_RESULT_PROCESSED; /* Response received */ - if (mad_send_wr->last_ack == mad_send_wr->total_seg) { + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) { mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); return IB_RMPP_RESULT_PROCESSED; /* Send done */ } - if (mad_send_wr->seg_num > mad_send_wr->newwin || - mad_send_wr->seg_num > mad_send_wr->total_seg) + if (mad_send_wr->seg_num == mad_send_wr->newwin || + mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */ ret = send_next_seg(mad_send_wr); @@ -921,10 +867,12 @@ int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr) IB_MGMT_RMPP_FLAG_ACTIVE)) return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */ - if (mad_send_wr->last_ack == mad_send_wr->total_seg) + if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) return IB_RMPP_RESULT_PROCESSED; - mad_send_wr->seg_num = mad_send_wr->last_ack + 1; + mad_send_wr->seg_num = mad_send_wr->last_ack; + mad_send_wr->cur_seg = mad_send_wr->last_ack_seg; + ret = send_next_seg(mad_send_wr); if (ret) return IB_RMPP_RESULT_PROCESSED; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index c908de8db5..fb6cd42601 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -31,7 +31,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
* - * $Id: user_mad.c 4010 2005-11-09 23:11:56Z roland $ + * $Id: user_mad.c 5596 2006-03-03 01:00:07Z sean.hefty $ */ #include @@ -121,6 +121,7 @@ struct ib_umad_file { struct ib_umad_packet { struct ib_mad_send_buf *msg; + struct ib_mad_recv_wc *recv_wc; struct list_head list; int length; struct ib_user_mad mad; @@ -176,31 +177,32 @@ static int queue_packet(struct ib_umad_file *file, return ret; } +static int data_offset(u8 mgmt_class) +{ + if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM) + return IB_MGMT_SA_HDR; + else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) && + (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)) + return IB_MGMT_VENDOR_HDR; + else + return IB_MGMT_RMPP_HDR; +} + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *send_wc) { struct ib_umad_file *file = agent->context; - struct ib_umad_packet *timeout; struct ib_umad_packet *packet = send_wc->send_buf->context[0]; ib_destroy_ah(packet->msg->ah); ib_free_send_mad(packet->msg); if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) { - timeout = kzalloc(sizeof *timeout + IB_MGMT_MAD_HDR, GFP_KERNEL); - if (!timeout) - goto out; - - timeout->length = IB_MGMT_MAD_HDR; - timeout->mad.hdr.id = packet->mad.hdr.id; - timeout->mad.hdr.status = ETIMEDOUT; - memcpy(timeout->mad.data, packet->mad.data, - sizeof (struct ib_mad_hdr)); - - if (queue_packet(file, agent, timeout)) - kfree(timeout); + packet->length = IB_MGMT_MAD_HDR; + packet->mad.hdr.status = ETIMEDOUT; + if (!queue_packet(file, agent, packet)) + return; } -out: kfree(packet); } @@ -209,22 +211,20 @@ static void recv_handler(struct ib_mad_agent *agent, { struct ib_umad_file *file = agent->context; struct ib_umad_packet *packet; - int length; if (mad_recv_wc->wc->status != IB_WC_SUCCESS) - goto out; + goto err1; - length = mad_recv_wc->mad_len; - packet = kzalloc(sizeof *packet + length, GFP_KERNEL); + packet = kzalloc(sizeof *packet, GFP_KERNEL); if (!packet) - goto out; + goto err1; - packet->length = length; - - 
ib_coalesce_recv_mad(mad_recv_wc, packet->mad.data); + packet->length = mad_recv_wc->mad_len; + packet->recv_wc = mad_recv_wc; packet->mad.hdr.status = 0; - packet->mad.hdr.length = length + sizeof (struct ib_user_mad); + packet->mad.hdr.length = sizeof (struct ib_user_mad) + + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; @@ -240,12 +240,79 @@ static void recv_handler(struct ib_mad_agent *agent, } if (queue_packet(file, agent, packet)) - kfree(packet); + goto err2; + return; -out: +err2: + kfree(packet); +err1: ib_free_recv_mad(mad_recv_wc); } +static ssize_t copy_recv_mad(char __user *buf, struct ib_umad_packet *packet, + size_t count) +{ + struct ib_mad_recv_buf *recv_buf; + int left, seg_payload, offset, max_seg_payload; + + /* We need enough room to copy the first (or only) MAD segment. */ + recv_buf = &packet->recv_wc->recv_buf; + if ((packet->length <= sizeof (*recv_buf->mad) && + count < sizeof (packet->mad) + packet->length) || + (packet->length > sizeof (*recv_buf->mad) && + count < sizeof (packet->mad) + sizeof (*recv_buf->mad))) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, sizeof (packet->mad))) + return -EFAULT; + + buf += sizeof (packet->mad); + seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad)); + if (copy_to_user(buf, recv_buf->mad, seg_payload)) + return -EFAULT; + + if (seg_payload < packet->length) { + /* + * Multipacket RMPP MAD message. Copy remainder of message. + * Note that last segment may have a shorter payload. + */ + if (count < sizeof (packet->mad) + packet->length) { + /* + * The buffer is too small, return the first RMPP segment, + * which includes the RMPP message length. 
+ */ + return -ENOSPC; + } + offset = data_offset(recv_buf->mad->mad_hdr.mgmt_class); + max_seg_payload = sizeof (struct ib_mad) - offset; + + for (left = packet->length - seg_payload, buf += seg_payload; + left; left -= seg_payload, buf += seg_payload) { + recv_buf = container_of(recv_buf->list.next, + struct ib_mad_recv_buf, list); + seg_payload = min(left, max_seg_payload); + if (copy_to_user(buf, ((void *) recv_buf->mad) + offset, + seg_payload)) + return -EFAULT; + } + } + return sizeof (packet->mad) + packet->length; +} + +static ssize_t copy_send_mad(char __user *buf, struct ib_umad_packet *packet, + size_t count) +{ + ssize_t size = sizeof (packet->mad) + packet->length; + + if (count < size) + return -EINVAL; + + if (copy_to_user(buf, &packet->mad, size)) + return -EFAULT; + + return size; +} + static ssize_t ib_umad_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { @@ -253,7 +320,7 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, struct ib_umad_packet *packet; ssize_t ret; - if (count < sizeof (struct ib_user_mad) + sizeof (struct ib_mad)) + if (count < sizeof (struct ib_user_mad)) return -EINVAL; spin_lock_irq(&file->recv_lock); @@ -276,28 +343,44 @@ static ssize_t ib_umad_read(struct file *filp, char __user *buf, spin_unlock_irq(&file->recv_lock); - if (count < packet->length + sizeof (struct ib_user_mad)) { - /* Return length needed (and first RMPP segment) if too small */ - if (copy_to_user(buf, &packet->mad, - sizeof (struct ib_user_mad) + sizeof (struct ib_mad))) - ret = -EFAULT; - else - ret = -ENOSPC; - } else if (copy_to_user(buf, &packet->mad, - packet->length + sizeof (struct ib_user_mad))) - ret = -EFAULT; + if (packet->recv_wc) + ret = copy_recv_mad(buf, packet, count); else - ret = packet->length + sizeof (struct ib_user_mad); + ret = copy_send_mad(buf, packet, count); + if (ret < 0) { /* Requeue packet */ spin_lock_irq(&file->recv_lock); list_add(&packet->list, &file->recv_list); 
spin_unlock_irq(&file->recv_lock); - } else + } else { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); kfree(packet); + } return ret; } +static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf) +{ + int left, seg; + + /* Copy class specific header */ + if ((msg->hdr_len > IB_MGMT_RMPP_HDR) && + copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR, + msg->hdr_len - IB_MGMT_RMPP_HDR)) + return -EFAULT; + + /* All headers are in place. Copy data segments. */ + for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0; + seg++, left -= msg->seg_size, buf += msg->seg_size) { + if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf, + min(left, msg->seg_size))) + return -EFAULT; + } + return 0; +} + static ssize_t ib_umad_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { @@ -309,14 +392,12 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, struct ib_rmpp_mad *rmpp_mad; u8 method; __be64 *tid; - int ret, length, hdr_len, copy_offset; - int rmpp_active, has_rmpp_header; + int ret, data_len, hdr_len, copy_offset, rmpp_active; if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR) return -EINVAL; - length = count - sizeof (struct ib_user_mad); - packet = kmalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); + packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL); if (!packet) return -ENOMEM; @@ -363,35 +444,25 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, if (rmpp_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_ADM) { hdr_len = IB_MGMT_SA_HDR; copy_offset = IB_MGMT_RMPP_HDR; - has_rmpp_header = 1; + rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE; } else if (rmpp_mad->mad_hdr.mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START && rmpp_mad->mad_hdr.mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END) { - hdr_len = IB_MGMT_VENDOR_HDR; - copy_offset = IB_MGMT_RMPP_HDR; - has_rmpp_header = 1; + hdr_len = 
IB_MGMT_VENDOR_HDR; + copy_offset = IB_MGMT_RMPP_HDR; + rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & + IB_MGMT_RMPP_FLAG_ACTIVE; } else { hdr_len = IB_MGMT_MAD_HDR; copy_offset = IB_MGMT_MAD_HDR; - has_rmpp_header = 0; - } - - if (has_rmpp_header) - rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & - IB_MGMT_RMPP_FLAG_ACTIVE; - else rmpp_active = 0; - - /* Validate that the management class can support RMPP */ - if (rmpp_active && !agent->rmpp_version) { - ret = -EINVAL; - goto err_ah; } + data_len = count - sizeof (struct ib_user_mad) - hdr_len; packet->msg = ib_create_send_mad(agent, be32_to_cpu(packet->mad.hdr.qpn), - 0, rmpp_active, - hdr_len, length - hdr_len, - GFP_KERNEL); + 0, rmpp_active, hdr_len, + data_len, GFP_KERNEL); if (IS_ERR(packet->msg)) { ret = PTR_ERR(packet->msg); goto err_ah; @@ -402,14 +473,21 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, packet->msg->retries = packet->mad.hdr.retries; packet->msg->context[0] = packet; - /* Copy MAD headers (RMPP header in place) */ + /* Copy MAD header. Any RMPP header is already in place. 
*/ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR); - /* Now, copy rest of message from user into send buffer */ - if (copy_from_user(packet->msg->mad + copy_offset, - buf + sizeof (struct ib_user_mad) + copy_offset, - length - copy_offset)) { - ret = -EFAULT; - goto err_msg; + buf += sizeof (struct ib_user_mad); + + if (!rmpp_active) { + if (copy_from_user(packet->msg->mad + copy_offset, + buf + copy_offset, + hdr_len + data_len - copy_offset)) { + ret = -EFAULT; + goto err_msg; + } + } else { + ret = copy_rmpp_mad(packet->msg, buf); + if (ret) + goto err_msg; } /* @@ -433,18 +511,14 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, goto err_msg; up_read(&file->port->mutex); - return count; err_msg: ib_free_send_mad(packet->msg); - err_ah: ib_destroy_ah(ah); - err_up: up_read(&file->port->mutex); - err: kfree(packet); return ret; @@ -627,8 +701,11 @@ static int ib_umad_close(struct inode *inode, struct file *filp) already_dead = file->agents_dead; file->agents_dead = 1; - list_for_each_entry_safe(packet, tmp, &file->recv_list, list) + list_for_each_entry_safe(packet, tmp, &file->recv_list, list) { + if (packet->recv_wc) + ib_free_recv_mad(packet->recv_wc); kfree(packet); + } list_del(&file->port_list); diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 2c13350674..51ab8eddb2 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -33,7 +33,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * - * $Id: ib_mad.h 2775 2005-07-02 13:42:12Z halr $ + * $Id: ib_mad.h 5596 2006-03-03 01:00:07Z sean.hefty $ */ #if !defined( IB_MAD_H ) @@ -208,15 +208,23 @@ struct ib_class_port_info /** * ib_mad_send_buf - MAD data buffer and work request for sends. * @next: A pointer used to chain together MADs for posting. - * @mad: References an allocated MAD data buffer. + * @mad: References an allocated MAD data buffer for MADs that do not have + * RMPP active. 
For MADs using RMPP, references the common and management + * class specific headers. * @mad_agent: MAD agent that allocated the buffer. * @ah: The address handle to use when sending the MAD. * @context: User-controlled context fields. + * @hdr_len: Indicates the size of the data header of the MAD. This length + * includes the common MAD, RMPP, and class specific headers. + * @data_len: Indicates the total size of user-transferred data. + * @seg_count: The number of RMPP segments allocated for this send. + * @seg_size: Size of each RMPP segment. * @timeout_ms: Time to wait for a response. * @retries: Number of times to retry a request for a response. * * Users are responsible for initializing the MAD buffer itself, with the - * exception of specifying the payload length field in any RMPP MAD. + * exception of any RMPP header. Additional segment buffer space allocated + * beyond data_len is padding. */ struct ib_mad_send_buf { struct ib_mad_send_buf *next; @@ -224,6 +232,10 @@ struct ib_mad_send_buf { struct ib_mad_agent *mad_agent; struct ib_ah *ah; void *context[2]; + int hdr_len; + int data_len; + int seg_count; + int seg_size; int timeout_ms; int retries; }; @@ -299,7 +311,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, * @mad_recv_wc: Received work completion information on the received MAD. * * MADs received in response to a send request operation will be handed to - * the user after the send operation completes. All data buffers given + * the user before the send operation completes. All data buffers given * to registered agents through this routine are owned by the receiving * client, except for snooping agents. Clients snooping MADs should not * modify the data referenced by @mad_recv_wc. 
@@ -485,17 +497,6 @@ int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); int ib_post_send_mad(struct ib_mad_send_buf *send_buf, struct ib_mad_send_buf **bad_send_buf); -/** - * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer. - * @mad_recv_wc: Work completion information for a received MAD. - * @buf: User-provided data buffer to receive the coalesced buffers. The - * referenced buffer should be at least the size of the mad_len specified - * by @mad_recv_wc. - * - * This call copies a chain of received MAD segments into a single data buffer, - * removing duplicated headers. - */ -void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf); /** * ib_free_recv_mad - Returns data buffers used to receive a MAD. @@ -590,9 +591,10 @@ int ib_process_mad_wc(struct ib_mad_agent *mad_agent, * with an initialized work request structure. Users may modify the returned * MAD data buffer before posting the send. * - * The returned data buffer will be cleared. Users are responsible for - * initializing the common MAD and any class specific headers. If @rmpp_active - * is set, the RMPP header will be initialized for sending. + * The returned MAD header, class specific headers, and any padding will be + * cleared. Users are responsible for initializing the common MAD header, + * any class specific header, and MAD data area. + * If @rmpp_active is set, the RMPP header will be initialized for sending. */ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, @@ -600,6 +602,16 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, int hdr_len, int data_len, gfp_t gfp_mask); +/** + * ib_get_rmpp_segment - returns the data buffer for a given RMPP segment. + * @send_buf: Previously allocated send data buffer. + * @seg_num: number of segment to return + * + * This routine returns a pointer to the data buffer of an RMPP MAD. 
+ * Users must provide synchronization to @send_buf around this call. + */ +void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num); + /** * ib_free_send_mad - Returns data buffers used to send a MAD. * @send_buf: Previously allocated send data buffer.