xhci: TD-fragment, align the unsplittable case with a bounce buffer
authorMathias Nyman <mathias.nyman@linux.intel.com>
Tue, 21 Jun 2016 07:58:02 +0000 (10:58 +0300)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 26 Jun 2016 18:43:39 +0000 (11:43 -0700)
If the last trb before a link is not packet size aligned, and is not
splittable then use a bounce buffer for that chunk of max packet size
unalignable data.

Allocate a max packet size bounce buffer for every segment of a bulk
endpoint ring at the same time as allocating the ring.
If we need to align the data before the link trb in that segment then
copy the data to the segment bounce buffer, dma map it, and enqueue it.
Once the td finishes, or is cancelled, unmap it.

For in transfers we need to first map the bounce buffer, then queue it,
after it finishes, copy the bounce buffer to the original sg list, and
finally unmap it

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/usb/host/xhci-mem.c
drivers/usb/host/xhci-ring.c
drivers/usb/host/xhci.c
drivers/usb/host/xhci.h

index bad0d1f9a41d4ff9edbcb186eb86238abf878c74..6afe32381209d76cd0cf2f46d686a9f07a43f2c9 100644 (file)
@@ -37,7 +37,9 @@
  * "All components of all Command and Transfer TRBs shall be initialized to '0'"
  */
 static struct xhci_segment *xhci_segment_alloc(struct xhci_hcd *xhci,
-                                       unsigned int cycle_state, gfp_t flags)
+                                              unsigned int cycle_state,
+                                              unsigned int max_packet,
+                                              gfp_t flags)
 {
        struct xhci_segment *seg;
        dma_addr_t      dma;
@@ -53,6 +55,14 @@ static struct xhci_segment *xhci_segment_alloc(struct xhci_hcd *xhci,
                return NULL;
        }
 
+       if (max_packet) {
+               seg->bounce_buf = kzalloc(max_packet, flags | GFP_DMA);
+               if (!seg->bounce_buf) {
+                       dma_pool_free(xhci->segment_pool, seg->trbs, dma);
+                       kfree(seg);
+                       return NULL;
+               }
+       }
        /* If the cycle state is 0, set the cycle bit to 1 for all the TRBs */
        if (cycle_state == 0) {
                for (i = 0; i < TRBS_PER_SEGMENT; i++)
@@ -70,6 +80,7 @@ static void xhci_segment_free(struct xhci_hcd *xhci, struct xhci_segment *seg)
                dma_pool_free(xhci->segment_pool, seg->trbs, seg->dma);
                seg->trbs = NULL;
        }
+       kfree(seg->bounce_buf);
        kfree(seg);
 }
 
@@ -317,11 +328,11 @@ static void xhci_initialize_ring_info(struct xhci_ring *ring,
 static int xhci_alloc_segments_for_ring(struct xhci_hcd *xhci,
                struct xhci_segment **first, struct xhci_segment **last,
                unsigned int num_segs, unsigned int cycle_state,
-               enum xhci_ring_type type, gfp_t flags)
+               enum xhci_ring_type type, unsigned int max_packet, gfp_t flags)
 {
        struct xhci_segment *prev;
 
-       prev = xhci_segment_alloc(xhci, cycle_state, flags);
+       prev = xhci_segment_alloc(xhci, cycle_state, max_packet, flags);
        if (!prev)
                return -ENOMEM;
        num_segs--;
@@ -330,7 +341,7 @@ static int xhci_alloc_segments_for_ring(struct xhci_hcd *xhci,
        while (num_segs > 0) {
                struct xhci_segment     *next;
 
-               next = xhci_segment_alloc(xhci, cycle_state, flags);
+               next = xhci_segment_alloc(xhci, cycle_state, max_packet, flags);
                if (!next) {
                        prev = *first;
                        while (prev) {
@@ -360,7 +371,7 @@ static int xhci_alloc_segments_for_ring(struct xhci_hcd *xhci,
  */
 static struct xhci_ring *xhci_ring_alloc(struct xhci_hcd *xhci,
                unsigned int num_segs, unsigned int cycle_state,
-               enum xhci_ring_type type, gfp_t flags)
+               enum xhci_ring_type type, unsigned int max_packet, gfp_t flags)
 {
        struct xhci_ring        *ring;
        int ret;
@@ -370,13 +381,15 @@ static struct xhci_ring *xhci_ring_alloc(struct xhci_hcd *xhci,
                return NULL;
 
        ring->num_segs = num_segs;
+       ring->bounce_buf_len = max_packet;
        INIT_LIST_HEAD(&ring->td_list);
        ring->type = type;
        if (num_segs == 0)
                return ring;
 
        ret = xhci_alloc_segments_for_ring(xhci, &ring->first_seg,
-                       &ring->last_seg, num_segs, cycle_state, type, flags);
+                       &ring->last_seg, num_segs, cycle_state, type,
+                       max_packet, flags);
        if (ret)
                goto fail;
 
@@ -470,7 +483,8 @@ int xhci_ring_expansion(struct xhci_hcd *xhci, struct xhci_ring *ring,
                        ring->num_segs : num_segs_needed;
 
        ret = xhci_alloc_segments_for_ring(xhci, &first, &last,
-                       num_segs, ring->cycle_state, ring->type, flags);
+                       num_segs, ring->cycle_state, ring->type,
+                       ring->bounce_buf_len, flags);
        if (ret)
                return -ENOMEM;
 
@@ -652,7 +666,8 @@ struct xhci_ring *xhci_stream_id_to_ring(
  */
 struct xhci_stream_info *xhci_alloc_stream_info(struct xhci_hcd *xhci,
                unsigned int num_stream_ctxs,
-               unsigned int num_streams, gfp_t mem_flags)
+               unsigned int num_streams,
+               unsigned int max_packet, gfp_t mem_flags)
 {
        struct xhci_stream_info *stream_info;
        u32 cur_stream;
@@ -704,9 +719,11 @@ struct xhci_stream_info *xhci_alloc_stream_info(struct xhci_hcd *xhci,
         * and add their segment DMA addresses to the radix tree.
         * Stream 0 is reserved.
         */
+
        for (cur_stream = 1; cur_stream < num_streams; cur_stream++) {
                stream_info->stream_rings[cur_stream] =
-                       xhci_ring_alloc(xhci, 2, 1, TYPE_STREAM, mem_flags);
+                       xhci_ring_alloc(xhci, 2, 1, TYPE_STREAM, max_packet,
+                                       mem_flags);
                cur_ring = stream_info->stream_rings[cur_stream];
                if (!cur_ring)
                        goto cleanup_rings;
@@ -1003,7 +1020,7 @@ int xhci_alloc_virt_device(struct xhci_hcd *xhci, int slot_id,
        }
 
        /* Allocate endpoint 0 ring */
-       dev->eps[0].ring = xhci_ring_alloc(xhci, 2, 1, TYPE_CTRL, flags);
+       dev->eps[0].ring = xhci_ring_alloc(xhci, 2, 1, TYPE_CTRL, 0, flags);
        if (!dev->eps[0].ring)
                goto fail;
 
@@ -1434,22 +1451,6 @@ int xhci_endpoint_init(struct xhci_hcd *xhci,
                return -EINVAL;
 
        ring_type = usb_endpoint_type(&ep->desc);
-       /* Set up the endpoint ring */
-       virt_dev->eps[ep_index].new_ring =
-               xhci_ring_alloc(xhci, 2, 1, ring_type, mem_flags);
-       if (!virt_dev->eps[ep_index].new_ring) {
-               /* Attempt to use the ring cache */
-               if (virt_dev->num_rings_cached == 0)
-                       return -ENOMEM;
-               virt_dev->num_rings_cached--;
-               virt_dev->eps[ep_index].new_ring =
-                       virt_dev->ring_cache[virt_dev->num_rings_cached];
-               virt_dev->ring_cache[virt_dev->num_rings_cached] = NULL;
-               xhci_reinit_cached_ring(xhci, virt_dev->eps[ep_index].new_ring,
-                                       1, ring_type);
-       }
-       virt_dev->eps[ep_index].skip = false;
-       ep_ring = virt_dev->eps[ep_index].new_ring;
 
        /*
         * Get values to fill the endpoint context, mostly from ep descriptor.
@@ -1479,6 +1480,23 @@ int xhci_endpoint_init(struct xhci_hcd *xhci,
        if ((xhci->hci_version > 0x100) && HCC2_LEC(xhci->hcc_params2))
                mult = 0;
 
+       /* Set up the endpoint ring */
+       virt_dev->eps[ep_index].new_ring =
+               xhci_ring_alloc(xhci, 2, 1, ring_type, max_packet, mem_flags);
+       if (!virt_dev->eps[ep_index].new_ring) {
+               /* Attempt to use the ring cache */
+               if (virt_dev->num_rings_cached == 0)
+                       return -ENOMEM;
+               virt_dev->num_rings_cached--;
+               virt_dev->eps[ep_index].new_ring =
+                       virt_dev->ring_cache[virt_dev->num_rings_cached];
+               virt_dev->ring_cache[virt_dev->num_rings_cached] = NULL;
+               xhci_reinit_cached_ring(xhci, virt_dev->eps[ep_index].new_ring,
+                                       1, ring_type);
+       }
+       virt_dev->eps[ep_index].skip = false;
+       ep_ring = virt_dev->eps[ep_index].new_ring;
+
        /* Fill the endpoint context */
        ep_ctx->ep_info = cpu_to_le32(EP_MAX_ESIT_PAYLOAD_HI(max_esit_payload) |
                                      EP_INTERVAL(interval) |
@@ -2409,7 +2427,7 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags)
                goto fail;
 
        /* Set up the command ring to have one segments for now. */
-       xhci->cmd_ring = xhci_ring_alloc(xhci, 1, 1, TYPE_COMMAND, flags);
+       xhci->cmd_ring = xhci_ring_alloc(xhci, 1, 1, TYPE_COMMAND, 0, flags);
        if (!xhci->cmd_ring)
                goto fail;
        xhci_dbg_trace(xhci, trace_xhci_dbg_init,
@@ -2454,7 +2472,7 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags)
         */
        xhci_dbg_trace(xhci, trace_xhci_dbg_init, "// Allocating event ring");
        xhci->event_ring = xhci_ring_alloc(xhci, ERST_NUM_SEGS, 1, TYPE_EVENT,
-                                               flags);
+                                       0, flags);
        if (!xhci->event_ring)
                goto fail;
        if (xhci_check_trb_in_td_math(xhci) < 0)
index 81bb8bfdb7e60af80df467ad25606dcdec7eb30a..b2c861eeeece7c25011bfa0efafdc635a323406b 100644 (file)
@@ -66,6 +66,7 @@
 
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/dma-mapping.h>
 #include "xhci.h"
 #include "xhci-trace.h"
 #include "xhci-mtk.h"
@@ -626,6 +627,31 @@ static void xhci_giveback_urb_in_irq(struct xhci_hcd *xhci,
        }
 }
 
+void xhci_unmap_td_bounce_buffer(struct xhci_hcd *xhci, struct xhci_ring *ring,
+                                struct xhci_td *td)
+{
+       struct device *dev = xhci_to_hcd(xhci)->self.controller;
+       struct xhci_segment *seg = td->bounce_seg;
+       struct urb *urb = td->urb;
+
+       if (!seg || !urb)
+               return;
+
+       if (usb_urb_dir_out(urb)) {
+               dma_unmap_single(dev, seg->bounce_dma, ring->bounce_buf_len,
+                                DMA_TO_DEVICE);
+               return;
+       }
+
+       /* for in tranfers we need to copy the data from bounce to sg */
+       sg_pcopy_from_buffer(urb->sg, urb->num_mapped_sgs, seg->bounce_buf,
+                            seg->bounce_len, seg->bounce_offs);
+       dma_unmap_single(dev, seg->bounce_dma, ring->bounce_buf_len,
+                        DMA_FROM_DEVICE);
+       seg->bounce_len = 0;
+       seg->bounce_offs = 0;
+}
+
 /*
  * When we get a command completion for a Stop Endpoint Command, we need to
  * unlink any cancelled TDs from the ring.  There are two ways to do that:
@@ -745,6 +771,8 @@ remove_finished_td:
                /* Doesn't matter what we pass for status, since the core will
                 * just overwrite it (because the URB has been unlinked).
                 */
+               if (ep_ring && cur_td->bounce_seg)
+                       xhci_unmap_td_bounce_buffer(xhci, ep_ring, cur_td);
                xhci_giveback_urb_in_irq(xhci, cur_td, 0);
 
                /* Stop processing the cancelled list if the watchdog timer is
@@ -767,6 +795,9 @@ static void xhci_kill_ring_urbs(struct xhci_hcd *xhci, struct xhci_ring *ring)
                list_del_init(&cur_td->td_list);
                if (!list_empty(&cur_td->cancelled_td_list))
                        list_del_init(&cur_td->cancelled_td_list);
+
+               if (cur_td->bounce_seg)
+                       xhci_unmap_td_bounce_buffer(xhci, ring, cur_td);
                xhci_giveback_urb_in_irq(xhci, cur_td, -ESHUTDOWN);
        }
 }
@@ -1865,6 +1896,10 @@ td_cleanup:
        urb = td->urb;
        urb_priv = urb->hcpriv;
 
+       /* if a bounce buffer was used to align this td then unmap it */
+       if (td->bounce_seg)
+               xhci_unmap_td_bounce_buffer(xhci, ep_ring, td);
+
        /* Do one last check of the actual transfer length.
         * If the host controller said we transferred more data than the buffer
         * length, urb->actual_length will be a very big number (since it's
@@ -3116,11 +3151,14 @@ static u32 xhci_td_remainder(struct xhci_hcd *xhci, int transferred,
        return (total_packet_count - ((transferred + trb_buff_len) / maxp));
 }
 
+
 static int xhci_align_td(struct xhci_hcd *xhci, struct urb *urb, u32 enqd_len,
-                        u32 *trb_buff_len)
+                        u32 *trb_buff_len, struct xhci_segment *seg)
 {
+       struct device *dev = xhci_to_hcd(xhci)->self.controller;
        unsigned int unalign;
        unsigned int max_pkt;
+       u32 new_buff_len;
 
        max_pkt = GET_MAX_PACKET(usb_endpoint_maxp(&urb->ep->desc));
        unalign = (enqd_len + *trb_buff_len) % max_pkt;
@@ -3129,11 +3167,48 @@ static int xhci_align_td(struct xhci_hcd *xhci, struct urb *urb, u32 enqd_len,
        if (unalign == 0)
                return 0;
 
+       xhci_dbg(xhci, "Unaligned %d bytes, buff len %d\n",
+                unalign, *trb_buff_len);
+
        /* is the last nornal TRB alignable by splitting it */
        if (*trb_buff_len > unalign) {
                *trb_buff_len -= unalign;
+               xhci_dbg(xhci, "split align, new buff len %d\n", *trb_buff_len);
                return 0;
        }
+
+       /*
+        * We want enqd_len + trb_buff_len to sum up to a number aligned to
+        * number which is divisible by the endpoint's wMaxPacketSize. IOW:
+        * (size of currently enqueued TRBs + remainder) % wMaxPacketSize == 0.
+        */
+       new_buff_len = max_pkt - (enqd_len % max_pkt);
+
+       if (new_buff_len > (urb->transfer_buffer_length - enqd_len))
+               new_buff_len = (urb->transfer_buffer_length - enqd_len);
+
+       /* create a max max_pkt sized bounce buffer pointed to by last trb */
+       if (usb_urb_dir_out(urb)) {
+               sg_pcopy_to_buffer(urb->sg, urb->num_mapped_sgs,
+                                  seg->bounce_buf, new_buff_len, enqd_len);
+               seg->bounce_dma = dma_map_single(dev, seg->bounce_buf,
+                                                max_pkt, DMA_TO_DEVICE);
+       } else {
+               seg->bounce_dma = dma_map_single(dev, seg->bounce_buf,
+                                                max_pkt, DMA_FROM_DEVICE);
+       }
+
+       if (dma_mapping_error(dev, seg->bounce_dma)) {
+               /* try without aligning. Some host controllers survive */
+               xhci_warn(xhci, "Failed mapping bounce buffer, not aligning\n");
+               return 0;
+       }
+       *trb_buff_len = new_buff_len;
+       seg->bounce_len = new_buff_len;
+       seg->bounce_offs = enqd_len;
+
+       xhci_dbg(xhci, "Bounce align, new buff len %d\n", *trb_buff_len);
+
        return 1;
 }
 
@@ -3152,9 +3227,9 @@ int xhci_queue_bulk_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
        unsigned int num_trbs;
        unsigned int start_cycle, num_sgs = 0;
        unsigned int enqd_len, block_len, trb_buff_len, full_len;
-       int ret;
+       int sent_len, ret;
        u32 field, length_field, remainder;
-       u64 addr;
+       u64 addr, send_addr;
 
        ring = xhci_urb_to_transfer_ring(xhci, urb);
        if (!ring)
@@ -3194,6 +3269,7 @@ int xhci_queue_bulk_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
         */
        start_trb = &ring->enqueue->generic;
        start_cycle = ring->cycle_state;
+       send_addr = addr;
 
        /* Queue the TRBs, even if they are zero-length */
        for (enqd_len = 0; enqd_len < full_len; enqd_len += trb_buff_len) {
@@ -3222,10 +3298,16 @@ int xhci_queue_bulk_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
                        if (last_trb(xhci, ring, ring->enq_seg,
                                     ring->enqueue + 1)) {
                                if (xhci_align_td(xhci, urb, enqd_len,
-                                                &trb_buff_len))
-                                       xhci_dbg(xhci, "TRB align fail\n");
+                                                 &trb_buff_len,
+                                                 ring->enq_seg)) {
+                                       send_addr = ring->enq_seg->bounce_dma;
+                                       /* assuming TD won't span 2 segs */
+                                       td->bounce_seg = ring->enq_seg;
+                               }
                        }
-               } else {
+               }
+               if (enqd_len + trb_buff_len >= full_len) {
+                       field &= ~TRB_CHAIN;
                        field |= TRB_IOC;
                        more_trbs_coming = false;
                        td->last_trb = ring->enqueue;
@@ -3244,23 +3326,27 @@ int xhci_queue_bulk_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
                        TRB_INTR_TARGET(0);
 
                queue_trb(xhci, ring, more_trbs_coming | need_zero_pkt,
-                               lower_32_bits(addr),
-                               upper_32_bits(addr),
+                               lower_32_bits(send_addr),
+                               upper_32_bits(send_addr),
                                length_field,
                                field);
 
                addr += trb_buff_len;
-               block_len -= trb_buff_len;
+               sent_len = trb_buff_len;
 
-               if (sg && block_len == 0) {
+               while (sg && sent_len >= block_len) {
                        /* New sg entry */
                        --num_sgs;
+                       sent_len -= block_len;
                        if (num_sgs != 0) {
                                sg = sg_next(sg);
                                block_len = sg_dma_len(sg);
                                addr = (u64) sg_dma_address(sg);
+                               addr += sent_len;
                        }
                }
+               block_len -= sent_len;
+               send_addr = addr;
        }
 
        if (need_zero_pkt) {
index f2f9518c53ab437548133349101983389c8de78b..9da98321d8e6fe393a3f4d995f7c6e94987107a7 100644 (file)
@@ -3139,6 +3139,7 @@ int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev,
        struct xhci_input_control_ctx *ctrl_ctx;
        unsigned int ep_index;
        unsigned int num_stream_ctxs;
+       unsigned int max_packet;
        unsigned long flags;
        u32 changed_ep_bitmask = 0;
 
@@ -3212,9 +3213,11 @@ int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev,
 
        for (i = 0; i < num_eps; i++) {
                ep_index = xhci_get_endpoint_index(&eps[i]->desc);
+               max_packet = GET_MAX_PACKET(usb_endpoint_maxp(&eps[i]->desc));
                vdev->eps[ep_index].stream_info = xhci_alloc_stream_info(xhci,
                                num_stream_ctxs,
-                               num_streams, mem_flags);
+                               num_streams,
+                               max_packet, mem_flags);
                if (!vdev->eps[ep_index].stream_info)
                        goto cleanup;
                /* Set maxPstreams in endpoint context and update deq ptr to
index b0b8d0f8791ae160df627f84028e0b9bf1f4d408..b2c1dc5dc0f30f17fa2f4354537c193958ab5f78 100644 (file)
@@ -1347,6 +1347,11 @@ struct xhci_segment {
        /* private to HCD */
        struct xhci_segment     *next;
        dma_addr_t              dma;
+       /* Max packet sized bounce buffer for td-fragmant alignment */
+       dma_addr_t              bounce_dma;
+       void                    *bounce_buf;
+       unsigned int            bounce_offs;
+       unsigned int            bounce_len;
 };
 
 struct xhci_td {
@@ -1356,6 +1361,7 @@ struct xhci_td {
        struct xhci_segment     *start_seg;
        union xhci_trb          *first_trb;
        union xhci_trb          *last_trb;
+       struct xhci_segment     *bounce_seg;
        /* actual_length of the URB has already been set */
        bool                    urb_length_set;
 };
@@ -1405,6 +1411,7 @@ struct xhci_ring {
        unsigned int            num_segs;
        unsigned int            num_trbs_free;
        unsigned int            num_trbs_free_temp;
+       unsigned int            bounce_buf_len;
        enum xhci_ring_type     type;
        bool                    last_td_was_short;
        struct radix_tree_root  *trb_address_map;
@@ -1807,7 +1814,8 @@ void xhci_free_or_cache_endpoint_ring(struct xhci_hcd *xhci,
                unsigned int ep_index);
 struct xhci_stream_info *xhci_alloc_stream_info(struct xhci_hcd *xhci,
                unsigned int num_stream_ctxs,
-               unsigned int num_streams, gfp_t flags);
+               unsigned int num_streams,
+               unsigned int max_packet, gfp_t flags);
 void xhci_free_stream_info(struct xhci_hcd *xhci,
                struct xhci_stream_info *stream_info);
 void xhci_setup_streams_ep_input_ctx(struct xhci_hcd *xhci,