MAINTAINERS: omap: fix regex
[safe/jmp/linux-2.6] / drivers / usb / host / xhci-ring.c
index f04162a..aa88a06 100644 (file)
  *   endpoint rings; it generates events on the event ring for these.
  */
 
+#include <linux/scatterlist.h>
 #include "xhci.h"
 
 /*
  * Returns zero if the TRB isn't in this segment, otherwise it returns the DMA
  * address of the TRB.
  */
-dma_addr_t trb_virt_to_dma(struct xhci_segment *seg,
+dma_addr_t xhci_trb_virt_to_dma(struct xhci_segment *seg,
                union xhci_trb *trb)
 {
-       unsigned int offset;
+       unsigned long segment_offset;
 
-       if (!seg || !trb || (void *) trb < (void *) seg->trbs)
+       if (!seg || !trb || trb < seg->trbs)
                return 0;
-       /* offset in bytes, since these are byte-addressable */
-       offset = (unsigned int) trb - (unsigned int) seg->trbs;
-       /* SEGMENT_SIZE in bytes, trbs are 16-byte aligned */
-       if (offset > SEGMENT_SIZE || (offset % sizeof(*trb)) != 0)
+       /* offset in TRBs */
+       segment_offset = trb - seg->trbs;
+       if (segment_offset > TRBS_PER_SEGMENT)
                return 0;
-       return seg->dma + offset;
+       return seg->dma + (segment_offset * sizeof(*trb));
 }
 
 /* Does this link TRB point to the first segment in a ring,
@@ -111,6 +111,23 @@ static inline int last_trb(struct xhci_hcd *xhci, struct xhci_ring *ring,
                return (trb->link.control & TRB_TYPE_BITMASK) == TRB_TYPE(TRB_LINK);
 }
 
+/* Updates trb to point to the next TRB in the ring, and updates seg if the next
+ * TRB is in a new segment.  This does not skip over link TRBs, and it does not
+ * affect the ring dequeue or enqueue pointers.
+ *
+ * Both *seg and *trb are in/out: when last_trb() reports that *trb is the
+ * final TRB of *seg, *seg moves to (*seg)->next and *trb to that segment's
+ * first TRB; otherwise only *trb advances by one.
+ */
+static void next_trb(struct xhci_hcd *xhci,
+               struct xhci_ring *ring,
+               struct xhci_segment **seg,
+               union xhci_trb **trb)
+{
+       if (last_trb(xhci, ring, *seg, *trb)) {
+               *seg = (*seg)->next;
+               *trb = ((*seg)->trbs);
+       } else {
+               /* Plain increment: "*trb = (*trb)++" would modify *trb twice
+                * with no sequence point in between, which is undefined.
+                */
+               (*trb)++;
+       }
+}
+
 /*
  * See Cycle bit rules. SW is the consumer for the event ring only.
  * Don't make a ring full of link TRBs.  That would be dumb and this would loop.
@@ -118,6 +135,7 @@ static inline int last_trb(struct xhci_hcd *xhci, struct xhci_ring *ring,
 static void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer)
 {
        union xhci_trb *next = ++(ring->dequeue);
+       unsigned long long addr;
 
        ring->deq_updates++;
        /* Update the dequeue pointer further if that was a link TRB or we're at
@@ -127,14 +145,21 @@ static void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer
                if (consumer && last_trb_on_last_seg(xhci, ring, ring->deq_seg, next)) {
                        ring->cycle_state = (ring->cycle_state ? 0 : 1);
                        if (!in_interrupt())
-                               xhci_dbg(xhci, "Toggle cycle state for ring 0x%x = %i\n",
-                                               (unsigned int) ring,
+                               xhci_dbg(xhci, "Toggle cycle state for ring %p = %i\n",
+                                               ring,
                                                (unsigned int) ring->cycle_state);
                }
                ring->deq_seg = ring->deq_seg->next;
                ring->dequeue = ring->deq_seg->trbs;
                next = ring->dequeue;
        }
+       addr = (unsigned long long) xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue);
+       if (ring == xhci->event_ring)
+               xhci_dbg(xhci, "Event ring deq = 0x%llx (DMA)\n", addr);
+       else if (ring == xhci->cmd_ring)
+               xhci_dbg(xhci, "Command ring deq = 0x%llx (DMA)\n", addr);
+       else
+               xhci_dbg(xhci, "Ring deq = 0x%llx (DMA)\n", addr);
 }
 
 /*
@@ -154,6 +179,7 @@ static void inc_enq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer
 {
        u32 chain;
        union xhci_trb *next;
+       unsigned long long addr;
 
        chain = ring->enqueue->generic.field[3] & TRB_CHAIN;
        next = ++(ring->enqueue);
@@ -165,20 +191,21 @@ static void inc_enq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer
        while (last_trb(xhci, ring, ring->enq_seg, next)) {
                if (!consumer) {
                        if (ring != xhci->event_ring) {
+                               next->link.control &= ~TRB_CHAIN;
+                               next->link.control |= chain;
                                /* Give this link TRB to the hardware */
+                               wmb();
                                if (next->link.control & TRB_CYCLE)
                                        next->link.control &= (u32) ~TRB_CYCLE;
                                else
                                        next->link.control |= (u32) TRB_CYCLE;
-                               next->link.control &= TRB_CHAIN;
-                               next->link.control |= chain;
                        }
                        /* Toggle the cycle bit after the last ring segment. */
                        if (last_trb_on_last_seg(xhci, ring, ring->enq_seg, next)) {
                                ring->cycle_state = (ring->cycle_state ? 0 : 1);
                                if (!in_interrupt())
-                                       xhci_dbg(xhci, "Toggle cycle state for ring 0x%x = %i\n",
-                                                       (unsigned int) ring,
+                                       xhci_dbg(xhci, "Toggle cycle state for ring %p = %i\n",
+                                                       ring,
                                                        (unsigned int) ring->cycle_state);
                        }
                }
@@ -186,6 +213,13 @@ static void inc_enq(struct xhci_hcd *xhci, struct xhci_ring *ring, bool consumer
                ring->enqueue = ring->enq_seg->trbs;
                next = ring->enqueue;
        }
+       addr = (unsigned long long) xhci_trb_virt_to_dma(ring->enq_seg, ring->enqueue);
+       if (ring == xhci->event_ring)
+               xhci_dbg(xhci, "Event ring enq = 0x%llx (DMA)\n", addr);
+       else if (ring == xhci->cmd_ring)
+               xhci_dbg(xhci, "Command ring enq = 0x%llx (DMA)\n", addr);
+       else
+               xhci_dbg(xhci, "Ring enq = 0x%llx (DMA)\n", addr);
 }
 
 /*
@@ -217,28 +251,30 @@ static int room_on_ring(struct xhci_hcd *xhci, struct xhci_ring *ring,
        return 1;
 }
 
-void set_hc_event_deq(struct xhci_hcd *xhci)
+void xhci_set_hc_event_deq(struct xhci_hcd *xhci)
 {
-       u32 temp;
+       u64 temp;
        dma_addr_t deq;
 
-       deq = trb_virt_to_dma(xhci->event_ring->deq_seg,
+       deq = xhci_trb_virt_to_dma(xhci->event_ring->deq_seg,
                        xhci->event_ring->dequeue);
        if (deq == 0 && !in_interrupt())
                xhci_warn(xhci, "WARN something wrong with SW event ring "
                                "dequeue ptr.\n");
        /* Update HC event ring dequeue pointer */
-       temp = xhci_readl(xhci, &xhci->ir_set->erst_dequeue[0]);
+       temp = xhci_read_64(xhci, &xhci->ir_set->erst_dequeue);
        temp &= ERST_PTR_MASK;
-       if (!in_interrupt())
-               xhci_dbg(xhci, "// Write event ring dequeue pointer\n");
-       xhci_writel(xhci, 0, &xhci->ir_set->erst_dequeue[1]);
-       xhci_writel(xhci, (deq & ~ERST_PTR_MASK) | temp,
-                       &xhci->ir_set->erst_dequeue[0]);
+       /* Don't clear the EHB bit (which is RW1C) because
+        * there might be more events to service.
+        * Masking ERST_EHB out of temp means a zero is written back in that
+        * bit position by the 64-bit write below; the remaining low bits
+        * selected by ERST_PTR_MASK are written back as they were read.
+        *
+        * NOTE(review): (u64) ~ERST_PTR_MASK keeps the upper 32 bits of deq
+        * only if ERST_PTR_MASK has a signed or 64-bit type; a 32-bit
+        * unsigned definition would zero them -- confirm against xhci.h.
+        */
+       temp &= ~ERST_EHB;
+       xhci_dbg(xhci, "// Write event ring dequeue pointer, preserving EHB bit\n");
+       xhci_write_64(xhci, ((u64) deq & (u64) ~ERST_PTR_MASK) | temp,
+                       &xhci->ir_set->erst_dequeue);
 }
 
 /* Ring the host controller doorbell after placing a command on the ring */
-void ring_cmd_db(struct xhci_hcd *xhci)
+void xhci_ring_cmd_db(struct xhci_hcd *xhci)
 {
        u32 temp;
 
@@ -249,6 +285,379 @@ void ring_cmd_db(struct xhci_hcd *xhci)
        xhci_readl(xhci, &xhci->dba->doorbell[0]);
 }
 
+static void ring_ep_doorbell(struct xhci_hcd *xhci,
+               unsigned int slot_id,
+               unsigned int ep_index)
+{
+       struct xhci_ring *ep_ring;
+       u32 field;
+       __u32 __iomem *db_addr = &xhci->dba->doorbell[slot_id];
+
+       ep_ring = xhci->devs[slot_id]->ep_rings[ep_index];
+       /* Don't ring the doorbell for this endpoint if there are pending
+        * cancellations because we don't want to interrupt processing.
+        * The doorbell is likewise left alone while the ring state has
+        * SET_DEQ_PENDING or EP_HALTED set; in all three cases this function
+        * returns without touching the register.
+        */
+       if (!ep_ring->cancels_pending && !(ep_ring->state & SET_DEQ_PENDING)
+                       && !(ep_ring->state & EP_HALTED)) {
+               field = xhci_readl(xhci, db_addr) & DB_MASK;
+               xhci_writel(xhci, field | EPI_TO_DB(ep_index), db_addr);
+               /* Flush PCI posted writes - FIXME Matthew Wilcox says this
+                * isn't time-critical and we shouldn't make the CPU wait for
+                * the flush.
+                */
+               xhci_readl(xhci, db_addr);
+       }
+}
+
+/*
+ * Find the segment that trb is in.  Start searching in start_seg.
+ * If we must move past a segment that has a link TRB with a toggle cycle state
+ * bit set, then we will toggle the value pointed at by cycle_state.
+ *
+ * A segment is considered to contain trb when trb lies between its first TRB
+ * and its TRBS_PER_SEGMENT-th TRB inclusive.
+ *
+ * Returns the containing segment, or NULL if the search arrives back at
+ * start_seg without finding one.
+ */
+static struct xhci_segment *find_trb_seg(
+               struct xhci_segment *start_seg,
+               union xhci_trb  *trb, int *cycle_state)
+{
+       struct xhci_segment *cur_seg = start_seg;
+       struct xhci_generic_trb *generic_trb;
+
+       while (cur_seg->trbs > trb ||
+                       &cur_seg->trbs[TRBS_PER_SEGMENT - 1] < trb) {
+               /* Inspect the last TRB of the segment we are about to leave */
+               generic_trb = &cur_seg->trbs[TRBS_PER_SEGMENT - 1].generic;
+               if (TRB_TYPE(generic_trb->field[3]) == TRB_LINK &&
+                               (generic_trb->field[3] & LINK_TOGGLE))
+                       *cycle_state = ~(*cycle_state) & 0x1;
+               cur_seg = cur_seg->next;
+               if (cur_seg == start_seg)
+                       /* Looped over the entire list.  Oops! */
+                       return NULL;
+       }
+       return cur_seg;
+}
+
+/*
+ * Move the xHC's endpoint ring dequeue pointer past cur_td.
+ * Record the new state of the xHC's endpoint ring dequeue segment,
+ * dequeue pointer, and new consumer cycle state in state.
+ * Update our internal representation of the ring's dequeue pointer.
+ *
+ * We do this in three jumps:
+ *  - First we update our new ring state to be the same as when the xHC stopped.
+ *  - Then we traverse the ring to find the segment that contains
+ *    the last TRB in the TD.  We toggle the xHC's new cycle state when we pass
+ *    any link TRBs with the toggle cycle bit set.
+ *  - Finally we move the dequeue state one TRB further, toggling the cycle bit
+ *    if we've moved it past a link TRB with the toggle cycle bit set.
+ *
+ * The first search starts at cur_td->start_seg and looks for
+ * ep_ring->stopped_trb; the second continues from the segment it found and
+ * looks for cur_td->last_trb.  BUG() fires if either search returns no
+ * segment.  Besides filling in *state, this writes ep_ring->dequeue and
+ * ep_ring->deq_seg.
+ */
+void xhci_find_new_dequeue_state(struct xhci_hcd *xhci,
+               unsigned int slot_id, unsigned int ep_index,
+               struct xhci_td *cur_td, struct xhci_dequeue_state *state)
+{
+       struct xhci_virt_device *dev = xhci->devs[slot_id];
+       struct xhci_ring *ep_ring = dev->ep_rings[ep_index];
+       struct xhci_generic_trb *trb;
+       struct xhci_ep_ctx *ep_ctx;
+       dma_addr_t addr;
+
+       state->new_cycle_state = 0;
+       xhci_dbg(xhci, "Finding segment containing stopped TRB.\n");
+       state->new_deq_seg = find_trb_seg(cur_td->start_seg,
+                       ep_ring->stopped_trb,
+                       &state->new_cycle_state);
+       if (!state->new_deq_seg)
+               BUG();
+       /* Dig out the cycle state saved by the xHC during the stop ep cmd.
+        * Bit 0 of the endpoint context's deq field replaces whatever the
+        * search above left in new_cycle_state.
+        */
+       xhci_dbg(xhci, "Finding endpoint context\n");
+       ep_ctx = xhci_get_ep_ctx(xhci, dev->out_ctx, ep_index);
+       state->new_cycle_state = 0x1 & ep_ctx->deq;
+
+       state->new_deq_ptr = cur_td->last_trb;
+       xhci_dbg(xhci, "Finding segment containing last TRB in TD.\n");
+       state->new_deq_seg = find_trb_seg(state->new_deq_seg,
+                       state->new_deq_ptr,
+                       &state->new_cycle_state);
+       if (!state->new_deq_seg)
+               BUG();
+
+       trb = &state->new_deq_ptr->generic;
+       if (TRB_TYPE(trb->field[3]) == TRB_LINK &&
+                               (trb->field[3] & LINK_TOGGLE))
+               state->new_cycle_state = ~(state->new_cycle_state) & 0x1;
+       next_trb(xhci, ep_ring, &state->new_deq_seg, &state->new_deq_ptr);
+
+       /* Don't update the ring cycle state for the producer (us).
+        * Only the dequeue pointer and dequeue segment are stored back
+        * into ep_ring below.
+        */
+       xhci_dbg(xhci, "New dequeue segment = %p (virtual)\n",
+                       state->new_deq_seg);
+       addr = xhci_trb_virt_to_dma(state->new_deq_seg, state->new_deq_ptr);
+       xhci_dbg(xhci, "New dequeue pointer = 0x%llx (DMA)\n",
+                       (unsigned long long) addr);
+       xhci_dbg(xhci, "Setting dequeue pointer in internal ring state.\n");
+       ep_ring->dequeue = state->new_deq_ptr;
+       ep_ring->deq_seg = state->new_deq_seg;
+}
+
+/*
+ * Neutralise every TRB of a cancelled TD, walking from cur_td->first_trb
+ * (in cur_td->start_seg) through cur_td->last_trb inclusive.
+ *
+ * Link TRBs only lose their chain bit, so their pointers stay intact.
+ * Every other TRB has its first three words zeroed and its control word
+ * reduced to the existing cycle bit plus the transfer-ring No-op type.
+ */
+static void td_to_noop(struct xhci_hcd *xhci, struct xhci_ring *ep_ring,
+               struct xhci_td *cur_td)
+{
+       struct xhci_segment *seg = cur_td->start_seg;
+       union xhci_trb *pos = cur_td->first_trb;
+
+       for (;;) {
+               struct xhci_generic_trb *gen = &pos->generic;
+
+               if ((gen->field[3] & TRB_TYPE_BITMASK) != TRB_TYPE(TRB_LINK)) {
+                       /* Ordinary TRB: wipe it, keeping only its cycle bit */
+                       gen->field[0] = 0;
+                       gen->field[1] = 0;
+                       gen->field[2] = 0;
+                       gen->field[3] &= TRB_CYCLE;
+                       gen->field[3] |= TRB_TYPE(TRB_TR_NOOP);
+                       xhci_dbg(xhci, "Cancel TRB %p (0x%llx dma) "
+                                       "in seg %p (0x%llx dma)\n",
+                                       pos,
+                                       (unsigned long long)xhci_trb_virt_to_dma(seg, pos),
+                                       seg,
+                                       (unsigned long long)seg->dma);
+               } else {
+                       /* Link TRB: drop the chain bit, leave the rest */
+                       gen->field[3] &= ~TRB_CHAIN;
+                       xhci_dbg(xhci, "Cancel (unchain) link TRB\n");
+                       xhci_dbg(xhci, "Address = %p (0x%llx dma); "
+                                       "in seg %p (0x%llx dma)\n",
+                                       pos,
+                                       (unsigned long long)xhci_trb_virt_to_dma(seg, pos),
+                                       seg,
+                                       (unsigned long long)seg->dma);
+               }
+
+               /* The TD's last TRB has been handled; nothing left to visit */
+               if (pos == cur_td->last_trb)
+                       break;
+               next_trb(xhci, ep_ring, &seg, &pos);
+       }
+}
+
+static int queue_set_tr_deq(struct xhci_hcd *xhci, int slot_id,
+               unsigned int ep_index, struct xhci_segment *deq_seg,
+               union xhci_trb *deq_ptr, u32 cycle_state);
+
+void xhci_queue_new_dequeue_state(struct xhci_hcd *xhci,
+               struct xhci_ring *ep_ring, unsigned int slot_id,
+               unsigned int ep_index, struct xhci_dequeue_state *deq_state)
+{
+       xhci_dbg(xhci, "Set TR Deq Ptr cmd, new deq seg = %p (0x%llx dma), "
+                       "new deq ptr = %p (0x%llx dma), new cycle = %u\n",
+                       deq_state->new_deq_seg,
+                       (unsigned long long)deq_state->new_deq_seg->dma,
+                       deq_state->new_deq_ptr,
+                       (unsigned long long)xhci_trb_virt_to_dma(deq_state->new_deq_seg, deq_state->new_deq_ptr),
+                       deq_state->new_cycle_state);
+       queue_set_tr_deq(xhci, slot_id, ep_index,
+                       deq_state->new_deq_seg,
+                       deq_state->new_deq_ptr,
+                       (u32) deq_state->new_cycle_state);
+       /* Stop the TD queueing code from ringing the doorbell until
+        * this command completes.  The HC won't set the dequeue pointer
+        * if the ring is running, and ringing the doorbell starts the
+        * ring running.
+        *
+        * NOTE(review): the int returned by queue_set_tr_deq() above is
+        * discarded, so SET_DEQ_PENDING is set and the command doorbell is
+        * rung whatever that call returned -- confirm this is intended.
+        */
+       ep_ring->state |= SET_DEQ_PENDING;
+       xhci_ring_cmd_db(xhci);
+}
+
+/*
+ * When we get a command completion for a Stop Endpoint Command, we need to
+ * unlink any cancelled TDs from the ring.  There are two ways to do that:
+ *
+ *  1. If the HW was in the middle of processing the TD that needs to be
+ *     cancelled, then we must move the ring's dequeue pointer past the last TRB
+ *     in the TD with a Set Dequeue Pointer Command.
+ *  2. Otherwise, we turn all the TRBs in the TD into No-op TRBs (with the chain
+ *     bit cleared) so that the HW will skip over them.
+ */
+static void handle_stopped_endpoint(struct xhci_hcd *xhci,
+               union xhci_trb *trb)
+{
+       unsigned int slot_id;
+       unsigned int ep_index;
+       struct xhci_ring *ep_ring;
+       struct list_head *entry;
+       struct xhci_td *cur_td = 0;
+       struct xhci_td *last_unlinked_td;
+
+       struct xhci_dequeue_state deq_state;
+#ifdef CONFIG_USB_HCD_STAT
+       ktime_t stop_time = ktime_get();
+#endif
+
+       memset(&deq_state, 0, sizeof(deq_state));
+       slot_id = TRB_TO_SLOT_ID(trb->generic.field[3]);
+       ep_index = TRB_TO_EP_INDEX(trb->generic.field[3]);
+       ep_ring = xhci->devs[slot_id]->ep_rings[ep_index];
+
+       if (list_empty(&ep_ring->cancelled_td_list))
+               return;
+
+       /* Fix up the ep ring first, so HW stops executing cancelled TDs.
+        * We have the xHCI lock, so nothing can modify this list until we drop
+        * it.  We're also in the event handler, so we can't get re-interrupted
+        * if another Stop Endpoint command completes
+        */
+       list_for_each(entry, &ep_ring->cancelled_td_list) {
+               cur_td = list_entry(entry, struct xhci_td, cancelled_td_list);
+               xhci_dbg(xhci, "Cancelling TD starting at %p, 0x%llx (dma).\n",
+                               cur_td->first_trb,
+                               (unsigned long long)xhci_trb_virt_to_dma(cur_td->start_seg, cur_td->first_trb));
+               /*
+                * If we stopped on the TD we need to cancel, then we have to
+                * move the xHC endpoint ring dequeue pointer past this TD.
+                */
+               if (cur_td == ep_ring->stopped_td)
+                       xhci_find_new_dequeue_state(xhci, slot_id, ep_index, cur_td,
+                                       &deq_state);
+               else
+                       td_to_noop(xhci, ep_ring, cur_td);
+               /*
+                * The event handler won't see a completion for this TD anymore,
+                * so remove it from the endpoint ring's TD list.  Keep it in
+                * the cancelled TD list for URB completion later.
+                */
+               list_del(&cur_td->td_list);
+               ep_ring->cancels_pending--;
+       }
+       last_unlinked_td = cur_td;
+
+       /* If necessary, queue a Set Transfer Ring Dequeue Pointer command */
+       if (deq_state.new_deq_ptr && deq_state.new_deq_seg) {
+               xhci_queue_new_dequeue_state(xhci, ep_ring,
+                               slot_id, ep_index, &deq_state);
+       } else {
+               /* Otherwise just ring the doorbell to restart the ring */
+               ring_ep_doorbell(xhci, slot_id, ep_index);
+       }
+
+       /*
+        * Drop the lock and complete the URBs in the cancelled TD list.
+        * New TDs to be cancelled might be added to the end of the list before
+        * we can complete all the URBs for the TDs we already unlinked.
+        * So stop when we've completed the URB for the last TD we unlinked.
+        */
+       do {
+               cur_td = list_entry(ep_ring->cancelled_td_list.next,
+                               struct xhci_td, cancelled_td_list);
+               list_del(&cur_td->cancelled_td_list);
+
+               /* Clean up the cancelled URB */
+#ifdef CONFIG_USB_HCD_STAT
+               hcd_stat_update(xhci->tp_stat, cur_td->urb->actual_length,
+                               ktime_sub(stop_time, cur_td->start_time));
+#endif
+               cur_td->urb->hcpriv = NULL;
+               usb_hcd_unlink_urb_from_ep(xhci_to_hcd(xhci), cur_td->urb);
+
+               xhci_dbg(xhci, "Giveback cancelled URB %p\n", cur_td->urb);
+               spin_unlock(&xhci->lock);
+               /* Doesn't matter what we pass for status, since the core will
+                * just overwrite it (because the URB has been unlinked).
+                */
+               usb_hcd_giveback_urb(xhci_to_hcd(xhci), cur_td->urb, 0);
+               kfree(cur_td);
+
+               spin_lock(&xhci->lock);
+       } while (cur_td != last_unlinked_td);
+
+       /* Return to the event handler with xhci->lock re-acquired */
+}
+
+/*
+ * When we get a completion for a Set Transfer Ring Dequeue Pointer command,
+ * we need to clear the set deq pending flag in the endpoint ring state, so that
+ * the TD queueing code can ring the doorbell again.  We also need to ring the
+ * endpoint doorbell to restart the ring, but only if there aren't more
+ * cancellations pending.
+ *
+ * event carries the completion code (read from event->status); trb is the
+ * command TRB being completed, whose field[3] supplies the slot ID and
+ * endpoint index.  A non-success completion code is only logged: the
+ * SET_DEQ_PENDING flag is cleared and ring_ep_doorbell() is called on every
+ * path through this function.
+ */
+static void handle_set_deq_completion(struct xhci_hcd *xhci,
+               struct xhci_event_cmd *event,
+               union xhci_trb *trb)
+{
+       unsigned int slot_id;
+       unsigned int ep_index;
+       struct xhci_ring *ep_ring;
+       struct xhci_virt_device *dev;
+       struct xhci_ep_ctx *ep_ctx;
+       struct xhci_slot_ctx *slot_ctx;
+
+       slot_id = TRB_TO_SLOT_ID(trb->generic.field[3]);
+       ep_index = TRB_TO_EP_INDEX(trb->generic.field[3]);
+       dev = xhci->devs[slot_id];
+       ep_ring = dev->ep_rings[ep_index];
+       ep_ctx = xhci_get_ep_ctx(xhci, dev->out_ctx, ep_index);
+       slot_ctx = xhci_get_slot_ctx(xhci, dev->out_ctx);
+
+       if (GET_COMP_CODE(event->status) != COMP_SUCCESS) {
+               unsigned int ep_state;
+               unsigned int slot_state;
+
+               switch (GET_COMP_CODE(event->status)) {
+               case COMP_TRB_ERR:
+                       xhci_warn(xhci, "WARN Set TR Deq Ptr cmd invalid because "
+                                       "of stream ID configuration\n");
+                       break;
+               case COMP_CTX_STATE:
+                       xhci_warn(xhci, "WARN Set TR Deq Ptr cmd failed due "
+                                       "to incorrect slot or ep state.\n");
+                       ep_state = ep_ctx->ep_info;
+                       ep_state &= EP_STATE_MASK;
+                       slot_state = slot_ctx->dev_state;
+                       slot_state = GET_SLOT_STATE(slot_state);
+                       xhci_dbg(xhci, "Slot state = %u, EP state = %u\n",
+                                       slot_state, ep_state);
+                       break;
+               case COMP_EBADSLT:
+                       xhci_warn(xhci, "WARN Set TR Deq Ptr cmd failed because "
+                                       "slot %u was not enabled.\n", slot_id);
+                       break;
+               default:
+                       xhci_warn(xhci, "WARN Set TR Deq Ptr cmd with unknown "
+                                       "completion code of %u.\n",
+                                       GET_COMP_CODE(event->status));
+                       break;
+               }
+               /* OK what do we do now?  The endpoint state is hosed, and we
+                * should never get to this point if the synchronization between
+                * queueing, and endpoint state are correct.  This might happen
+                * if the device gets disconnected after we've finished
+                * cancelling URBs, which might not be an error...
+                *
+                * For now no recovery is attempted here; execution falls
+                * through to the common flag-clear and doorbell code below.
+                */
+       } else {
+               xhci_dbg(xhci, "Successful Set TR Deq Ptr cmd, deq = @%08llx\n",
+                               ep_ctx->deq);
+       }
+
+       ep_ring->state &= ~SET_DEQ_PENDING;
+       ring_ep_doorbell(xhci, slot_id, ep_index);
+}
+
+/*
+ * Handle the completion of a Reset Endpoint command.
+ *
+ * event carries the completion code, which is logged and otherwise ignored;
+ * trb is the command TRB being completed, whose field[3] supplies the slot ID
+ * and endpoint index.  The EP_HALTED flag in the endpoint ring state is
+ * cleared and ring_ep_doorbell() is then called for that endpoint.
+ */
+static void handle_reset_ep_completion(struct xhci_hcd *xhci,
+               struct xhci_event_cmd *event,
+               union xhci_trb *trb)
+{
+       /* unsigned, matching the other completion handlers in this file and
+        * the slot_id parameter of ring_ep_doorbell()
+        */
+       unsigned int slot_id;
+       unsigned int ep_index;
+
+       slot_id = TRB_TO_SLOT_ID(trb->generic.field[3]);
+       ep_index = TRB_TO_EP_INDEX(trb->generic.field[3]);
+       /* This command will only fail if the endpoint wasn't halted,
+        * but we don't care.
+        */
+       xhci_dbg(xhci, "Ignoring reset ep completion code of %u\n",
+                       (unsigned int) GET_COMP_CODE(event->status));
+
+       /* Clear our internal halted state and restart the ring */
+       xhci->devs[slot_id]->ep_rings[ep_index]->state &= ~EP_HALTED;
+       ring_ep_doorbell(xhci, slot_id, ep_index);
+}
+
 static void handle_cmd_completion(struct xhci_hcd *xhci,
                struct xhci_event_cmd *event)
 {
@@ -256,8 +665,8 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
        u64 cmd_dma;
        dma_addr_t cmd_dequeue_dma;
 
-       cmd_dma = (((u64) event->cmd_trb[1]) << 32) + event->cmd_trb[0];
-       cmd_dequeue_dma = trb_virt_to_dma(xhci->cmd_ring->deq_seg,
+       cmd_dma = event->cmd_trb;
+       cmd_dequeue_dma = xhci_trb_virt_to_dma(xhci->cmd_ring->deq_seg,
                        xhci->cmd_ring->dequeue);
        /* Is the command ring deq ptr out of sync with the deq seg ptr? */
        if (cmd_dequeue_dma == 0) {
@@ -281,13 +690,26 @@ static void handle_cmd_completion(struct xhci_hcd *xhci,
                if (xhci->devs[slot_id])
                        xhci_free_virt_device(xhci, slot_id);
                break;
+       case TRB_TYPE(TRB_CONFIG_EP):
+               xhci->devs[slot_id]->cmd_status = GET_COMP_CODE(event->status);
+               complete(&xhci->devs[slot_id]->cmd_completion);
+               break;
        case TRB_TYPE(TRB_ADDR_DEV):
                xhci->devs[slot_id]->cmd_status = GET_COMP_CODE(event->status);
                complete(&xhci->addr_dev);
                break;
+       case TRB_TYPE(TRB_STOP_RING):
+               handle_stopped_endpoint(xhci, xhci->cmd_ring->dequeue);
+               break;
+       case TRB_TYPE(TRB_SET_DEQ):
+               handle_set_deq_completion(xhci, event, xhci->cmd_ring->dequeue);
+               break;
        case TRB_TYPE(TRB_CMD_NOOP):
                ++xhci->noops_handled;
                break;
+       case TRB_TYPE(TRB_RESET_EP):
+               handle_reset_ep_completion(xhci, event, xhci->cmd_ring->dequeue);
+               break;
        default:
                /* Skip over unknown commands on the event ring */
                xhci->error_bitmask |= 1 << 6;
@@ -312,7 +734,7 @@ static void handle_port_status(struct xhci_hcd *xhci,
 
        /* Update event ring dequeue pointer before dropping the lock */
        inc_deq(xhci, xhci->event_ring, true);
-       set_hc_event_deq(xhci);
+       xhci_set_hc_event_deq(xhci);
 
        spin_unlock(&xhci->lock);
        /* Pass this up to the core */
@@ -337,17 +759,15 @@ static struct xhci_segment *trb_in_td(
        dma_addr_t end_trb_dma;
        struct xhci_segment *cur_seg;
 
-       start_dma = trb_virt_to_dma(start_seg, start_trb);
+       start_dma = xhci_trb_virt_to_dma(start_seg, start_trb);
        cur_seg = start_seg;
 
        do {
-               /*
-                * Last TRB is a link TRB (unless we start inserting links in
-                * the middle, FIXME if you do)
-                */
-               end_seg_dma = trb_virt_to_dma(cur_seg, &start_seg->trbs[TRBS_PER_SEGMENT - 2]);
+               /* We may get an event for a Link TRB in the middle of a TD */
+               end_seg_dma = xhci_trb_virt_to_dma(cur_seg,
+                               &start_seg->trbs[TRBS_PER_SEGMENT - 1]);
                /* If the end TRB isn't in this segment, this is set to 0 */
-               end_trb_dma = trb_virt_to_dma(cur_seg, end_trb);
+               end_trb_dma = xhci_trb_virt_to_dma(cur_seg, end_trb);
 
                if (end_trb_dma > 0) {
                        /* The end TRB is in this segment, so suspect should be here */
@@ -371,7 +791,7 @@ static struct xhci_segment *trb_in_td(
                                return cur_seg;
                }
                cur_seg = cur_seg->next;
-               start_dma = trb_virt_to_dma(cur_seg, &cur_seg->trbs[0]);
+               start_dma = xhci_trb_virt_to_dma(cur_seg, &cur_seg->trbs[0]);
        } while (1);
 
 }
@@ -391,9 +811,11 @@ static int handle_tx_event(struct xhci_hcd *xhci,
        dma_addr_t event_dma;
        struct xhci_segment *event_seg;
        union xhci_trb *event_trb;
-       struct urb *urb = NULL;
+       struct urb *urb = 0;
        int status = -EINPROGRESS;
+       struct xhci_ep_ctx *ep_ctx;
 
+       xhci_dbg(xhci, "In %s\n", __func__);
        xdev = xhci->devs[TRB_TO_SLOT_ID(event->flags)];
        if (!xdev) {
                xhci_err(xhci, "ERROR Transfer event pointed to bad slot\n");
@@ -402,17 +824,17 @@ static int handle_tx_event(struct xhci_hcd *xhci,
 
        /* Endpoint ID is 1 based, our index is zero based */
        ep_index = TRB_TO_EP_ID(event->flags) - 1;
+       xhci_dbg(xhci, "%s - ep index = %d\n", __func__, ep_index);
        ep_ring = xdev->ep_rings[ep_index];
-       if (!ep_ring || (xdev->out_ctx->ep[ep_index].ep_info & EP_STATE_MASK) == EP_STATE_DISABLED) {
+       ep_ctx = xhci_get_ep_ctx(xhci, xdev->out_ctx, ep_index);
+       if (!ep_ring || (ep_ctx->ep_info & EP_STATE_MASK) == EP_STATE_DISABLED) {
                xhci_err(xhci, "ERROR Transfer event pointed to disabled endpoint\n");
                return -ENODEV;
        }
 
-       event_dma = event->buffer[0];
-       if (event->buffer[1] != 0)
-               xhci_warn(xhci, "WARN ignoring upper 32-bits of 64-bit TRB dma address\n");
-
+       event_dma = event->buffer;
        /* This TRB should be in the TD at the head of this ring's TD list */
+       xhci_dbg(xhci, "%s - checking for list empty\n", __func__);
        if (list_empty(&ep_ring->td_list)) {
                xhci_warn(xhci, "WARN Event TRB for slot %d ep %d with no TDs queued?\n",
                                TRB_TO_SLOT_ID(event->flags), ep_index);
@@ -422,18 +844,71 @@ static int handle_tx_event(struct xhci_hcd *xhci,
                urb = NULL;
                goto cleanup;
        }
+       xhci_dbg(xhci, "%s - getting list entry\n", __func__);
        td = list_entry(ep_ring->td_list.next, struct xhci_td, td_list);
 
        /* Is this a TRB in the currently executing TD? */
+       xhci_dbg(xhci, "%s - looking for TD\n", __func__);
        event_seg = trb_in_td(ep_ring->deq_seg, ep_ring->dequeue,
                        td->last_trb, event_dma);
+       xhci_dbg(xhci, "%s - found event_seg = %p\n", __func__, event_seg);
        if (!event_seg) {
                /* HC is busted, give up! */
                xhci_err(xhci, "ERROR Transfer event TRB DMA ptr not part of current TD\n");
                return -ESHUTDOWN;
        }
        event_trb = &event_seg->trbs[(event_dma - event_seg->dma) / sizeof(*event_trb)];
-
+       xhci_dbg(xhci, "Event TRB with TRB type ID %u\n",
+                       (unsigned int) (event->flags & TRB_TYPE_BITMASK)>>10);
+       xhci_dbg(xhci, "Offset 0x00 (buffer lo) = 0x%x\n",
+                       lower_32_bits(event->buffer));
+       xhci_dbg(xhci, "Offset 0x04 (buffer hi) = 0x%x\n",
+                       upper_32_bits(event->buffer));
+       xhci_dbg(xhci, "Offset 0x08 (transfer length) = 0x%x\n",
+                       (unsigned int) event->transfer_len);
+       xhci_dbg(xhci, "Offset 0x0C (flags) = 0x%x\n",
+                       (unsigned int) event->flags);
+
+       /* Look for common error cases */
+       switch (GET_COMP_CODE(event->transfer_len)) {
+       /* Skip codes that require special handling depending on
+        * transfer type
+        */
+       case COMP_SUCCESS:
+       case COMP_SHORT_TX:
+               break;
+       case COMP_STOP:
+               xhci_dbg(xhci, "Stopped on Transfer TRB\n");
+               break;
+       case COMP_STOP_INVAL:
+               xhci_dbg(xhci, "Stopped on No-op or Link TRB\n");
+               break;
+       case COMP_STALL:
+               xhci_warn(xhci, "WARN: Stalled endpoint\n");
+               ep_ring->state |= EP_HALTED;
+               status = -EPIPE;
+               break;
+       case COMP_TRB_ERR:
+               xhci_warn(xhci, "WARN: TRB error on endpoint\n");
+               status = -EILSEQ;
+               break;
+       case COMP_TX_ERR:
+               xhci_warn(xhci, "WARN: transfer error on endpoint\n");
+               status = -EPROTO;
+               break;
+       case COMP_BABBLE:
+               xhci_warn(xhci, "WARN: babble error on endpoint\n");
+               status = -EOVERFLOW;
+               break;
+       case COMP_DB_ERR:
+               xhci_warn(xhci, "WARN: HC couldn't access mem fast enough\n");
+               status = -ENOSR;
+               break;
+       default:
+               xhci_warn(xhci, "ERROR Unknown event condition, HC probably busted\n");
+               urb = NULL;
+               goto cleanup;
+       }
        /* Now update the urb's actual_length and give back to the core */
        /* Was this a control transfer? */
        if (usb_endpoint_xfer_control(&td->urb->ep->desc)) {
@@ -455,25 +930,9 @@ static int handle_tx_event(struct xhci_hcd *xhci,
                        xhci_warn(xhci, "WARN: short transfer on control ep\n");
                        status = -EREMOTEIO;
                        break;
-               case COMP_STALL:
-                       xhci_warn(xhci, "WARN: Stalled control ep\n");
-                       status = -EPIPE;
-                       break;
-               case COMP_TRB_ERR:
-                       xhci_warn(xhci, "WARN: TRB error on control ep\n");
-                       status = -EILSEQ;
-                       break;
-               case COMP_TX_ERR:
-                       xhci_warn(xhci, "WARN: transfer error on control ep\n");
-                       status = -EPROTO;
-                       break;
-               case COMP_DB_ERR:
-                       xhci_warn(xhci, "WARN: HC couldn't access mem fast enough on control TX\n");
-                       status = -ENOSR;
-                       break;
                default:
-                       xhci_dbg(xhci, "ERROR Unknown event condition, HC probably busted\n");
-                       goto cleanup;
+                       /* Others already handled above */
+                       break;
                }
                /*
                 * Did we transfer any data, despite the errors that might have
@@ -482,30 +941,160 @@ static int handle_tx_event(struct xhci_hcd *xhci,
                if (event_trb != ep_ring->dequeue) {
                        /* The event was for the status stage */
                        if (event_trb == td->last_trb) {
-                               td->urb->actual_length = td->urb->transfer_buffer_length;
+                               if (td->urb->actual_length != 0) {
+                                       /* Don't overwrite a previously set error code */
+                                       if (status == -EINPROGRESS || status == 0)
+                                               /* Did we already see a short data stage? */
+                                               status = -EREMOTEIO;
+                               } else {
+                                       td->urb->actual_length =
+                                               td->urb->transfer_buffer_length;
+                               }
                        } else {
-                       /* The event was for the data stage */
-                               td->urb->actual_length = td->urb->transfer_buffer_length -
+                       /* Maybe the event was for the data stage? */
+                               if (GET_COMP_CODE(event->transfer_len) != COMP_STOP_INVAL) {
+                                       /* We didn't stop on a link TRB in the middle */
+                                       td->urb->actual_length =
+                                               td->urb->transfer_buffer_length -
+                                               TRB_LEN(event->transfer_len);
+                                       xhci_dbg(xhci, "Waiting for status stage event\n");
+                                       urb = NULL;
+                                       goto cleanup;
+                               }
+                       }
+               }
+       } else {
+               switch (GET_COMP_CODE(event->transfer_len)) {
+               case COMP_SUCCESS:
+                       /* Double check that the HW transferred everything. */
+                       if (event_trb != td->last_trb) {
+                               xhci_warn(xhci, "WARN Successful completion "
+                                               "on short TX\n");
+                               if (td->urb->transfer_flags & URB_SHORT_NOT_OK)
+                                       status = -EREMOTEIO;
+                               else
+                                       status = 0;
+                       } else {
+                               xhci_dbg(xhci, "Successful bulk transfer!\n");
+                               status = 0;
+                       }
+                       break;
+               case COMP_SHORT_TX:
+                       if (td->urb->transfer_flags & URB_SHORT_NOT_OK)
+                               status = -EREMOTEIO;
+                       else
+                               status = 0;
+                       break;
+               default:
+                       /* Others already handled above */
+                       break;
+               }
+               dev_dbg(&td->urb->dev->dev,
+                               "ep %#x - asked for %d bytes, "
+                               "%d bytes untransferred\n",
+                               td->urb->ep->desc.bEndpointAddress,
+                               td->urb->transfer_buffer_length,
+                               TRB_LEN(event->transfer_len));
+               /* Fast path - was this the last TRB in the TD for this URB? */
+               if (event_trb == td->last_trb) {
+                       if (TRB_LEN(event->transfer_len) != 0) {
+                               td->urb->actual_length =
+                                       td->urb->transfer_buffer_length -
                                        TRB_LEN(event->transfer_len);
+                               if (td->urb->actual_length < 0) {
+                                       xhci_warn(xhci, "HC gave bad length "
+                                                       "of %d bytes left\n",
+                                                       TRB_LEN(event->transfer_len));
+                                       td->urb->actual_length = 0;
+                               }
+                               /* Don't overwrite a previously set error code */
+                               if (status == -EINPROGRESS) {
+                                       if (td->urb->transfer_flags & URB_SHORT_NOT_OK)
+                                               status = -EREMOTEIO;
+                                       else
+                                               status = 0;
+                               }
+                       } else {
+                               td->urb->actual_length = td->urb->transfer_buffer_length;
+                               /* Ignore a short packet completion if the
+                                * untransferred length was zero.
+                                */
+                               if (status == -EREMOTEIO)
+                                       status = 0;
+                       }
+               } else {
+                       /* Slow path - walk the list, starting from the dequeue
+                        * pointer, to get the actual length transferred.
+                        */
+                       union xhci_trb *cur_trb;
+                       struct xhci_segment *cur_seg;
+
+                       td->urb->actual_length = 0;
+                       for (cur_trb = ep_ring->dequeue, cur_seg = ep_ring->deq_seg;
+                                       cur_trb != event_trb;
+                                       next_trb(xhci, ep_ring, &cur_seg, &cur_trb)) {
+                               if (TRB_TYPE(cur_trb->generic.field[3]) != TRB_TR_NOOP &&
+                                               TRB_TYPE(cur_trb->generic.field[3]) != TRB_LINK)
+                                       td->urb->actual_length +=
+                                               TRB_LEN(cur_trb->generic.field[2]);
                        }
+                       /* If the ring didn't stop on a Link or No-op TRB, add
+                        * in the actual bytes transferred from the Normal TRB
+                        */
+                       if (GET_COMP_CODE(event->transfer_len) != COMP_STOP_INVAL)
+                               td->urb->actual_length +=
+                                       TRB_LEN(cur_trb->generic.field[2]) -
+                                       TRB_LEN(event->transfer_len);
                }
-               while (ep_ring->dequeue != td->last_trb)
+       }
+       if (GET_COMP_CODE(event->transfer_len) == COMP_STOP_INVAL ||
+                       GET_COMP_CODE(event->transfer_len) == COMP_STOP) {
+               /* The Endpoint Stop Command completion will take care of any
+                * stopped TDs.  A stopped TD may be restarted, so don't update
+                * the ring dequeue pointer or take this TD off any lists yet.
+                */
+               ep_ring->stopped_td = td;
+               ep_ring->stopped_trb = event_trb;
+       } else {
+               if (GET_COMP_CODE(event->transfer_len) == COMP_STALL) {
+                       /* The transfer is completed from the driver's
+                        * perspective, but we need to issue a set dequeue
+                        * command for this stalled endpoint to move the dequeue
+                        * pointer past the TD.  We can't do that here because
+                        * the halt condition must be cleared first.
+                        */
+                       ep_ring->stopped_td = td;
+                       ep_ring->stopped_trb = event_trb;
+               } else {
+                       /* Update ring dequeue pointer */
+                       while (ep_ring->dequeue != td->last_trb)
+                               inc_deq(xhci, ep_ring, false);
                        inc_deq(xhci, ep_ring, false);
-               inc_deq(xhci, ep_ring, false);
+               }
 
                /* Clean up the endpoint's TD list */
                urb = td->urb;
                list_del(&td->td_list);
-               kfree(td);
-       } else {
-               xhci_dbg(xhci, "FIXME do something for non-control transfers\n");
+               /* Was this TD slated to be cancelled but completed anyway? */
+               if (!list_empty(&td->cancelled_td_list)) {
+                       list_del(&td->cancelled_td_list);
+                       ep_ring->cancels_pending--;
+               }
+               /* Leave the TD around for the reset endpoint function to use */
+               if (GET_COMP_CODE(event->transfer_len) != COMP_STALL) {
+                       kfree(td);
+               }
+               urb->hcpriv = NULL;
        }
 cleanup:
        inc_deq(xhci, xhci->event_ring, true);
-       set_hc_event_deq(xhci);
+       xhci_set_hc_event_deq(xhci);
 
+       /* FIXME for multi-TD URBs (which have buffers bigger than 64MB) */
        if (urb) {
                usb_hcd_unlink_urb_from_ep(xhci_to_hcd(xhci), urb);
+               xhci_dbg(xhci, "Giveback URB %p, len = %d, status = %d\n",
+                               urb, td->urb->actual_length, status);
                spin_unlock(&xhci->lock);
                usb_hcd_giveback_urb(xhci_to_hcd(xhci), urb, status);
                spin_lock(&xhci->lock);
@@ -517,12 +1106,13 @@ cleanup:
  * This function handles all OS-owned events on the event ring.  It may drop
  * xhci->lock between event processing (e.g. to pass up port status changes).
  */
-void handle_event(struct xhci_hcd *xhci)
+void xhci_handle_event(struct xhci_hcd *xhci)
 {
        union xhci_trb *event;
        int update_ptrs = 1;
        int ret;
 
+       xhci_dbg(xhci, "In %s\n", __func__);
        if (!xhci->event_ring || !xhci->event_ring->dequeue) {
                xhci->error_bitmask |= 1 << 1;
                return;
@@ -535,18 +1125,25 @@ void handle_event(struct xhci_hcd *xhci)
                xhci->error_bitmask |= 1 << 2;
                return;
        }
+       xhci_dbg(xhci, "%s - OS owns TRB\n", __func__);
 
        /* FIXME: Handle more event types. */
        switch ((event->event_cmd.flags & TRB_TYPE_BITMASK)) {
        case TRB_TYPE(TRB_COMPLETION):
+               xhci_dbg(xhci, "%s - calling handle_cmd_completion\n", __func__);
                handle_cmd_completion(xhci, &event->event_cmd);
+               xhci_dbg(xhci, "%s - returned from handle_cmd_completion\n", __func__);
                break;
        case TRB_TYPE(TRB_PORT_STATUS):
+               xhci_dbg(xhci, "%s - calling handle_port_status\n", __func__);
                handle_port_status(xhci, event);
+               xhci_dbg(xhci, "%s - returned from handle_port_status\n", __func__);
                update_ptrs = 0;
                break;
        case TRB_TYPE(TRB_TRANSFER):
+               xhci_dbg(xhci, "%s - calling handle_tx_event\n", __func__);
                ret = handle_tx_event(xhci, &event->trans_event);
+               xhci_dbg(xhci, "%s - returned from handle_tx_event\n", __func__);
                if (ret < 0)
                        xhci->error_bitmask |= 1 << 9;
                else
@@ -559,10 +1156,10 @@ void handle_event(struct xhci_hcd *xhci)
        if (update_ptrs) {
                /* Update SW and HC event ring dequeue pointer */
                inc_deq(xhci, xhci->event_ring, true);
-               set_hc_event_deq(xhci);
+               xhci_set_hc_event_deq(xhci);
        }
        /* Are there more items on the event ring? */
-       handle_event(xhci);
+       xhci_handle_event(xhci);
 }
 
 /****          Endpoint Ring Operations        ****/
@@ -602,13 +1199,13 @@ static int prepare_ring(struct xhci_hcd *xhci, struct xhci_ring *ep_ring,
                 */
                xhci_warn(xhci, "WARN urb submitted to disabled ep\n");
                return -ENOENT;
-       case EP_STATE_HALTED:
        case EP_STATE_ERROR:
-               xhci_warn(xhci, "WARN waiting for halt or error on ep "
-                               "to be cleared\n");
+               xhci_warn(xhci, "WARN waiting for error on ep to be cleared\n");
                /* FIXME event handling code for error needs to clear it */
                /* XXX not sure if this should be -ENOENT or not */
                return -EINVAL;
+       case EP_STATE_HALTED:
+               xhci_dbg(xhci, "WARN halted endpoint, queueing URB anyway.\n");
        case EP_STATE_STOPPED:
        case EP_STATE_RUNNING:
                break;
@@ -628,7 +1225,7 @@ static int prepare_ring(struct xhci_hcd *xhci, struct xhci_ring *ep_ring,
        return 0;
 }
 
-int xhci_prepare_transfer(struct xhci_hcd *xhci,
+static int prepare_transfer(struct xhci_hcd *xhci,
                struct xhci_virt_device *xdev,
                unsigned int ep_index,
                unsigned int num_trbs,
@@ -637,9 +1234,9 @@ int xhci_prepare_transfer(struct xhci_hcd *xhci,
                gfp_t mem_flags)
 {
        int ret;
-
+       struct xhci_ep_ctx *ep_ctx = xhci_get_ep_ctx(xhci, xdev->out_ctx, ep_index);
        ret = prepare_ring(xhci, xdev->ep_rings[ep_index],
-                       xdev->out_ctx->ep[ep_index].ep_info & EP_STATE_MASK,
+                       ep_ctx->ep_info & EP_STATE_MASK,
                        num_trbs, mem_flags);
        if (ret)
                return ret;
@@ -647,6 +1244,7 @@ int xhci_prepare_transfer(struct xhci_hcd *xhci,
        if (!*td)
                return -ENOMEM;
        INIT_LIST_HEAD(&(*td)->td_list);
+       INIT_LIST_HEAD(&(*td)->cancelled_td_list);
 
        ret = usb_hcd_link_urb_to_ep(xhci_to_hcd(xhci), urb);
        if (unlikely(ret)) {
@@ -658,12 +1256,333 @@ int xhci_prepare_transfer(struct xhci_hcd *xhci,
        urb->hcpriv = (void *) (*td);
        /* Add this TD to the tail of the endpoint ring's TD list */
        list_add_tail(&(*td)->td_list, &xdev->ep_rings[ep_index]->td_list);
+       (*td)->start_seg = xdev->ep_rings[ep_index]->enq_seg;
+       (*td)->first_trb = xdev->ep_rings[ep_index]->enqueue;
 
        return 0;
 }
 
+/*
+ * Count the TRBs needed to queue the scatter-gather list attached to @urb.
+ *
+ * Each sg entry is split wherever its DMA range crosses a 64KB boundary, and
+ * counting stops once urb->transfer_buffer_length bytes have been accounted
+ * for.  The result is passed to prepare_transfer() and counted down by
+ * queue_bulk_sg_tx() to recognise the final TRB of the TD, so it has to match
+ * the number of TRBs that function really queues.
+ *
+ * Returns the number of TRBs.
+ */
+static unsigned int count_sg_trbs_needed(struct xhci_hcd *xhci, struct urb *urb)
+{
+       int num_sgs, num_trbs, running_total, temp, i;
+       struct scatterlist *sg;
+
+       sg = NULL;
+       num_sgs = urb->num_sgs;
+       /* temp tracks how many requested bytes still have to be placed */
+       temp = urb->transfer_buffer_length;
+
+       xhci_dbg(xhci, "count sg list trbs: \n");
+       num_trbs = 0;
+       for_each_sg(urb->sg->sg, sg, num_sgs, i) {
+               unsigned int previous_total_trbs = num_trbs;
+               unsigned int len = sg_dma_len(sg);
+
+               /* Scatter gather list entries may cross 64KB boundaries */
+               running_total = TRB_MAX_BUFF_SIZE -
+                       (sg_dma_address(sg) & ((1 << TRB_MAX_BUFF_SHIFT) - 1));
+               /*
+                * NOTE(review): if TRB_MAX_BUFF_SIZE == 1 << TRB_MAX_BUFF_SHIFT
+                * (not visible here) running_total is never zero, so every
+                * entry that is visited costs at least one TRB.
+                */
+               if (running_total != 0)
+                       num_trbs++;
+
+               /*
+                * How many more 64KB chunks to transfer, how many more TRBs?
+                *
+                * Stop at the end of this entry or at the end of the requested
+                * transfer, whichever comes first.  Without the second bound,
+                * an entry that is longer than the bytes still wanted would be
+                * counted in full and num_trbs would end up larger than the
+                * number of TRBs queue_bulk_sg_tx() queues; its last TRB would
+                * then keep the chain bit and td->last_trb would never be set.
+                */
+               while (running_total < len && running_total < temp) {
+                       num_trbs++;
+                       running_total += TRB_MAX_BUFF_SIZE;
+               }
+               xhci_dbg(xhci, " sg #%d: dma = %#llx, len = %#x (%d), num_trbs = %d\n",
+                               i, (unsigned long long)sg_dma_address(sg),
+                               len, len, num_trbs - previous_total_trbs);
+
+               /* Consume this entry's share of the requested length */
+               len = min_t(int, len, temp);
+               temp -= len;
+               if (temp == 0)
+                       break;
+       }
+       xhci_dbg(xhci, "\n");
+       if (!in_interrupt())
+               dev_dbg(&urb->dev->dev, "ep %#x - urb len = %d, sglist used, num_trbs = %d\n",
+                               urb->ep->desc.bEndpointAddress,
+                               urb->transfer_buffer_length,
+                               num_trbs);
+       return num_trbs;
+}
+
+/*
+ * Debug-only sanity check run after a TD has been queued.
+ *
+ * @num_trbs is what is left of the TRB budget and @running_total is the number
+ * of bytes that were put into TRBs.  A non-zero leftover, or a byte count that
+ * differs from urb->transfer_buffer_length, is reported through dev_dbg().
+ * Nothing is corrected here; the function only logs.
+ */
+static void check_trb_math(struct urb *urb, int num_trbs, int running_total)
+{
+       /* Every budgeted TRB should have been used up */
+       if (num_trbs) {
+               dev_dbg(&urb->dev->dev,
+                               "%s - ep %#x - Miscalculated number of TRBs, %d left\n",
+                               __func__,
+                               urb->ep->desc.bEndpointAddress,
+                               num_trbs);
+       }
+
+       /* The queued bytes should add up to what the driver asked for */
+       if (urb->transfer_buffer_length != running_total) {
+               dev_dbg(&urb->dev->dev,
+                               "%s - ep %#x - Miscalculated tx length, queued %#x (%d), asked for %#x (%d)\n",
+                               __func__,
+                               urb->ep->desc.bEndpointAddress,
+                               running_total, running_total,
+                               urb->transfer_buffer_length,
+                               urb->transfer_buffer_length);
+       }
+}
+
+static void giveback_first_trb(struct xhci_hcd *xhci, int slot_id,
+               unsigned int ep_index, int start_cycle,
+               struct xhci_generic_trb *start_trb, struct xhci_td *td)
+{
+       /*
+        * Pass all the TRBs to the hardware at once and make sure this write
+        * isn't reordered.
+        *
+        * The bulk queueing functions in this file queue the first TRB of a
+        * TD without the ring's cycle state in it and save that state in
+        * start_cycle.  OR-ing start_cycle into field[3] of start_trb is
+        * therefore the last write to the TD, and the wmb() is issued before
+        * it so the earlier TRB writes are not reordered after it.  Once the
+        * cycle bit is in place, the doorbell for (slot_id, ep_index) is rung.
+        *
+        * The td argument is not used by this function.
+        */
+       wmb();
+       start_trb->field[3] |= start_cycle;
+       ring_ep_doorbell(xhci, slot_id, ep_index);
+}
+
+static int queue_bulk_sg_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
+               struct urb *urb, int slot_id, unsigned int ep_index)
+{
+       struct xhci_ring *ep_ring;
+       unsigned int num_trbs;
+       struct xhci_td *td;
+       struct scatterlist *sg;
+       int num_sgs;
+       int trb_buff_len, this_sg_len, running_total;
+       bool first_trb;
+       u64 addr;
+
+       struct xhci_generic_trb *start_trb;
+       int start_cycle;
+
+       ep_ring = xhci->devs[slot_id]->ep_rings[ep_index];
+       num_trbs = count_sg_trbs_needed(xhci, urb);
+       num_sgs = urb->num_sgs;
+
+       trb_buff_len = prepare_transfer(xhci, xhci->devs[slot_id],
+                       ep_index, num_trbs, urb, &td, mem_flags);
+       if (trb_buff_len < 0)
+               return trb_buff_len;
+       /*
+        * Overview: this function queues the Normal TRBs for a bulk URB whose
+        * data is described by the scatter-gather list in urb->sg.  It walks
+        * the list entry by entry, queues one TRB per piece, and returns 0
+        * once the TD has been handed to the hardware, or the negative value
+        * returned by prepare_transfer() above.
+        *
+        * Note that trb_buff_len doubled as the return code of
+        * prepare_transfer() above; from here on it holds the byte count of
+        * the TRB that is about to be queued.
+        *
+        * Don't give the first TRB to the hardware (by toggling the cycle bit)
+        * until we've finished creating all the other TRBs.  The ring's cycle
+        * state may change as we enqueue the other TRBs, so save it too.
+        */
+       start_trb = &ep_ring->enqueue->generic;
+       start_cycle = ep_ring->cycle_state;
+
+       running_total = 0;
+       /*
+        * How much data is in the first TRB?
+        *
+        * There are three forces at work for TRB buffer pointers and lengths:
+        * 1. We don't want to walk off the end of this sg-list entry buffer.
+        * 2. The transfer length that the driver requested may be smaller than
+        *    the amount of memory allocated for this scatter-gather list.
+        * 3. TRB buffers can't cross 64KB boundaries.
+        *
+        * The clamps below apply force 3 first (distance to the next 64KB
+        * boundary), then force 1 (this_sg_len), then force 2
+        * (urb->transfer_buffer_length).
+        */
+       sg = urb->sg->sg;
+       addr = (u64) sg_dma_address(sg);
+       this_sg_len = sg_dma_len(sg);
+       trb_buff_len = TRB_MAX_BUFF_SIZE -
+               (addr & ((1 << TRB_MAX_BUFF_SHIFT) - 1));
+       trb_buff_len = min_t(int, trb_buff_len, this_sg_len);
+       if (trb_buff_len > urb->transfer_buffer_length)
+               trb_buff_len = urb->transfer_buffer_length;
+       xhci_dbg(xhci, "First length to xfer from 1st sglist entry = %u\n",
+                       trb_buff_len);
+
+       first_trb = true;
+       /* Queue the first TRB, even if it's zero-length
+        *
+        * The do/while form means at least one TRB is always queued.  The
+        * loop then keeps going until running_total reaches
+        * urb->transfer_buffer_length or the sg entries run out.
+        */
+       do {
+               u32 field = 0;
+               u32 length_field = 0;
+
+               /* Don't change the cycle bit of the first TRB until later */
+               if (first_trb)
+                       first_trb = false;
+               else
+                       field |= ep_ring->cycle_state;
+
+               /* Chain all the TRBs together; clear the chain bit in the last
+                * TRB to indicate it's the last TRB in the chain.
+                *
+                * num_trbs is counted down once per queued TRB, so as soon as
+                * it is no longer greater than 1 this TRB is treated as the
+                * final one: it is recorded in td->last_trb and gets TRB_IOC
+                * instead of TRB_CHAIN.
+                */
+               if (num_trbs > 1) {
+                       field |= TRB_CHAIN;
+               } else {
+                       /* FIXME - add check for ZERO_PACKET flag before this */
+                       td->last_trb = ep_ring->enqueue;
+                       field |= TRB_IOC;
+               }
+               xhci_dbg(xhci, " sg entry: dma = %#x, len = %#x (%d), "
+                               "64KB boundary at %#x, end dma = %#x\n",
+                               (unsigned int) addr, trb_buff_len, trb_buff_len,
+                               (unsigned int) (addr + TRB_MAX_BUFF_SIZE) & ~(TRB_MAX_BUFF_SIZE - 1),
+                               (unsigned int) addr + trb_buff_len);
+               if (TRB_MAX_BUFF_SIZE -
+                               (addr & ((1 << TRB_MAX_BUFF_SHIFT) - 1)) < trb_buff_len) {
+                       xhci_warn(xhci, "WARN: sg dma xfer crosses 64KB boundaries!\n");
+                       xhci_dbg(xhci, "Next boundary at %#x, end dma = %#x\n",
+                                       (unsigned int) (addr + TRB_MAX_BUFF_SIZE) & ~(TRB_MAX_BUFF_SIZE - 1),
+                                       (unsigned int) addr + trb_buff_len);
+               }
+               length_field = TRB_LEN(trb_buff_len) |
+                       TD_REMAINDER(urb->transfer_buffer_length - running_total) |
+                       TRB_INTR_TARGET(0);
+               queue_trb(xhci, ep_ring, false,
+                               lower_32_bits(addr),
+                               upper_32_bits(addr),
+                               length_field,
+                               /* We always want to know if the TRB was short,
+                                * or we won't get an event when it completes.
+                                * (Unless we use event data TRBs, which are a
+                                * waste of space and HC resources.)
+                                */
+                               field | TRB_ISP | TRB_TYPE(TRB_NORMAL));
+               --num_trbs;
+               running_total += trb_buff_len;
+
+               /* Calculate length for next transfer --
+                * Are we done queueing all the TRBs for this sg entry?
+                *
+                * If so, move on to the next sg entry, or leave the loop when
+                * num_sgs reaches zero.  Otherwise carry on inside the same
+                * entry from where the TRB just queued ended.
+                */
+               this_sg_len -= trb_buff_len;
+               if (this_sg_len == 0) {
+                       --num_sgs;
+                       if (num_sgs == 0)
+                               break;
+                       sg = sg_next(sg);
+                       addr = (u64) sg_dma_address(sg);
+                       this_sg_len = sg_dma_len(sg);
+               } else {
+                       addr += trb_buff_len;
+               }
+
+               trb_buff_len = TRB_MAX_BUFF_SIZE -
+                       (addr & ((1 << TRB_MAX_BUFF_SHIFT) - 1));
+               trb_buff_len = min_t(int, trb_buff_len, this_sg_len);
+               if (running_total + trb_buff_len > urb->transfer_buffer_length)
+                       trb_buff_len =
+                               urb->transfer_buffer_length - running_total;
+       } while (running_total < urb->transfer_buffer_length);
+
+       check_trb_math(urb, num_trbs, running_total);
+       giveback_first_trb(xhci, slot_id, ep_index, start_cycle, start_trb, td);
+       return 0;
+}
+
+/* This is very similar to what ehci-q.c qtd_fill() does
+ *
+ * Queue the TRBs for a bulk URB on the ring of endpoint ep_index of the
+ * device in slot slot_id.  If urb->sg is set, the work is handed to
+ * queue_bulk_sg_tx().  Otherwise the buffer at urb->transfer_dma is split
+ * into pieces that do not cross a 64KB boundary, one Normal TRB is queued per
+ * piece (a single zero-length TRB for an empty transfer), and the first TRB
+ * is given to the hardware last via giveback_first_trb().
+ *
+ * Returns 0 on success, or the negative value returned by prepare_transfer().
+ */
+int xhci_queue_bulk_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
+               struct urb *urb, int slot_id, unsigned int ep_index)
+{
+       struct xhci_ring *ep_ring;
+       struct xhci_td *td;
+       int num_trbs;
+       struct xhci_generic_trb *start_trb;
+       bool first_trb;
+       int start_cycle;
+       u32 field, length_field;
+
+       int running_total, trb_buff_len, ret;
+       u64 addr;
+
+       if (urb->sg)
+               return queue_bulk_sg_tx(xhci, mem_flags, urb, slot_id, ep_index);
+
+       ep_ring = xhci->devs[slot_id]->ep_rings[ep_index];
+
+       num_trbs = 0;
+       /* How much data is (potentially) left before the 64KB boundary?
+        *
+        * In this first pass running_total is only used to count TRBs; it is
+        * reset to zero before any TRB is actually queued.
+        */
+       running_total = TRB_MAX_BUFF_SIZE -
+               (urb->transfer_dma & ((1 << TRB_MAX_BUFF_SHIFT) - 1));
+
+       /* If there's some data on this 64KB chunk, or we have to send a
+        * zero-length transfer, we need at least one TRB
+        */
+       if (running_total != 0 || urb->transfer_buffer_length == 0)
+               num_trbs++;
+       /* How many more 64KB chunks to transfer, how many more TRBs? */
+       while (running_total < urb->transfer_buffer_length) {
+               num_trbs++;
+               running_total += TRB_MAX_BUFF_SIZE;
+       }
+       /* FIXME: this doesn't deal with URB_ZERO_PACKET - need one more */
+
+       if (!in_interrupt())
+               dev_dbg(&urb->dev->dev, "ep %#x - urb len = %#x (%d), addr = %#llx, num_trbs = %d\n",
+                               urb->ep->desc.bEndpointAddress,
+                               urb->transfer_buffer_length,
+                               urb->transfer_buffer_length,
+                               (unsigned long long)urb->transfer_dma,
+                               num_trbs);
+
+       ret = prepare_transfer(xhci, xhci->devs[slot_id], ep_index,
+                       num_trbs, urb, &td, mem_flags);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * Don't give the first TRB to the hardware (by toggling the cycle bit)
+        * until we've finished creating all the other TRBs.  The ring's cycle
+        * state may change as we enqueue the other TRBs, so save it too.
+        */
+       start_trb = &ep_ring->enqueue->generic;
+       start_cycle = ep_ring->cycle_state;
+
+       running_total = 0;
+       /* How much data is in the first TRB?
+        *
+        * Either everything up to the next 64KB boundary, or the whole
+        * transfer if that is shorter.
+        */
+       addr = (u64) urb->transfer_dma;
+       trb_buff_len = TRB_MAX_BUFF_SIZE -
+               (urb->transfer_dma & ((1 << TRB_MAX_BUFF_SHIFT) - 1));
+       if (urb->transfer_buffer_length < trb_buff_len)
+               trb_buff_len = urb->transfer_buffer_length;
+
+       first_trb = true;
+
+       /* Queue the first TRB, even if it's zero-length */
+       do {
+               field = 0;
+
+               /* Don't change the cycle bit of the first TRB until later */
+               if (first_trb)
+                       first_trb = false;
+               else
+                       field |= ep_ring->cycle_state;
+
+               /* Chain all the TRBs together; clear the chain bit in the last
+                * TRB to indicate it's the last TRB in the chain.
+                *
+                * num_trbs is counted down once per queued TRB, so the TRB
+                * queued when it is no longer greater than 1 is recorded in
+                * td->last_trb and gets TRB_IOC instead of TRB_CHAIN.
+                */
+               if (num_trbs > 1) {
+                       field |= TRB_CHAIN;
+               } else {
+                       /* FIXME - add check for ZERO_PACKET flag before this */
+                       td->last_trb = ep_ring->enqueue;
+                       field |= TRB_IOC;
+               }
+               length_field = TRB_LEN(trb_buff_len) |
+                       TD_REMAINDER(urb->transfer_buffer_length - running_total) |
+                       TRB_INTR_TARGET(0);
+               queue_trb(xhci, ep_ring, false,
+                               lower_32_bits(addr),
+                               upper_32_bits(addr),
+                               length_field,
+                               /* We always want to know if the TRB was short,
+                                * or we won't get an event when it completes.
+                                * (Unless we use event data TRBs, which are a
+                                * waste of space and HC resources.)
+                                */
+                               field | TRB_ISP | TRB_TYPE(TRB_NORMAL));
+               --num_trbs;
+               running_total += trb_buff_len;
+
+               /* Calculate length for next transfer
+                *
+                * Later TRBs are only capped at TRB_MAX_BUFF_SIZE and at the
+                * bytes still outstanding.  NOTE(review): this relies on addr
+                * sitting on a 64KB boundary after the first TRB, which holds
+                * if TRB_MAX_BUFF_SIZE == 1 << TRB_MAX_BUFF_SHIFT - confirm
+                * against the definitions in xhci.h.
+                */
+               addr += trb_buff_len;
+               trb_buff_len = urb->transfer_buffer_length - running_total;
+               if (trb_buff_len > TRB_MAX_BUFF_SIZE)
+                       trb_buff_len = TRB_MAX_BUFF_SIZE;
+       } while (running_total < urb->transfer_buffer_length);
+
+       check_trb_math(urb, num_trbs, running_total);
+       giveback_first_trb(xhci, slot_id, ep_index, start_cycle, start_trb, td);
+       return 0;
+}
+
 /* Caller must have locked xhci->lock */
-int queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
+int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
                struct urb *urb, int slot_id, unsigned int ep_index)
 {
        struct xhci_ring *ep_ring;
@@ -672,7 +1591,7 @@ int queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
        struct usb_ctrlrequest *setup;
        struct xhci_generic_trb *start_trb;
        int start_cycle;
-       u32 field;
+       u32 field, length_field;
        struct xhci_td *td;
 
        ep_ring = xhci->devs[slot_id]->ep_rings[ep_index];
@@ -696,7 +1615,7 @@ int queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
         */
        if (urb->transfer_buffer_length > 0)
                num_trbs++;
-       ret = xhci_prepare_transfer(xhci, xhci->devs[slot_id], ep_index, num_trbs,
+       ret = prepare_transfer(xhci, xhci->devs[slot_id], ep_index, num_trbs,
                        urb, &td, mem_flags);
        if (ret < 0)
                return ret;
@@ -722,13 +1641,16 @@ int queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
 
        /* If there's data, queue data TRBs */
        field = 0;
+       length_field = TRB_LEN(urb->transfer_buffer_length) |
+               TD_REMAINDER(urb->transfer_buffer_length) |
+               TRB_INTR_TARGET(0);
        if (urb->transfer_buffer_length > 0) {
                if (setup->bRequestType & USB_DIR_IN)
                        field |= TRB_DIR_IN;
                queue_trb(xhci, ep_ring, false,
                                lower_32_bits(urb->transfer_dma),
                                upper_32_bits(urb->transfer_dma),
-                               TRB_LEN(urb->transfer_buffer_length) | TRB_INTR_TARGET(0),
+                               length_field,
                                /* Event on short tx */
                                field | TRB_ISP | TRB_TYPE(TRB_DATA) | ep_ring->cycle_state);
        }
@@ -749,17 +1671,7 @@ int queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
                        /* Event on completion */
                        field | TRB_IOC | TRB_TYPE(TRB_STATUS) | ep_ring->cycle_state);
 
-       /*
-        * Pass all the TRBs to the hardware at once and make sure this write
-        * isn't reordered.
-        */
-       wmb();
-       start_trb->field[3] |= start_cycle;
-       field = xhci_readl(xhci, &xhci->dba->doorbell[slot_id]) & DB_MASK;
-       xhci_writel(xhci, field | EPI_TO_DB(ep_index), &xhci->dba->doorbell[slot_id]);
-       /* Flush PCI posted writes */
-       xhci_readl(xhci, &xhci->dba->doorbell[slot_id]);
-
+       giveback_first_trb(xhci, slot_id, ep_index, start_cycle, start_trb, td);
        return 0;
 }
 
@@ -788,24 +1700,80 @@ static int queue_cmd_noop(struct xhci_hcd *xhci)
  * Place a no-op command on the command ring to test the command and
  * event ring.
  */
-void *setup_one_noop(struct xhci_hcd *xhci)
+void *xhci_setup_one_noop(struct xhci_hcd *xhci)
 {
+       /* Nothing to hand back if the no-op could not be queued. */
        if (queue_cmd_noop(xhci) < 0)
                return NULL;
+       /* Count the no-op only after it has been queued successfully. */
        xhci->noops_submitted++;
-       return ring_cmd_db;
+       /* NOTE(review): xhci_ring_cmd_db is presumably the routine that rings
+        * the command doorbell, returned as an opaque pointer for the caller
+        * to invoke -- confirm against the caller.
+        */
+       return xhci_ring_cmd_db;
 }
 
 /* Queue a slot enable or disable request on the command ring */
-int queue_slot_control(struct xhci_hcd *xhci, u32 trb_type, u32 slot_id)
+int xhci_queue_slot_control(struct xhci_hcd *xhci, u32 trb_type, u32 slot_id)
 {
+       /* The first three command fields are zero; trb_type and slot_id are
+        * encoded together in the last field.  The result of queue_command()
+        * is returned unchanged.
+        */
        return queue_command(xhci, 0, 0, 0,
                        TRB_TYPE(trb_type) | SLOT_ID_FOR_TRB(slot_id));
 }
 
 /* Queue an address device command TRB */
-int queue_address_device(struct xhci_hcd *xhci, dma_addr_t in_ctx_ptr, u32 slot_id)
+int xhci_queue_address_device(struct xhci_hcd *xhci, dma_addr_t in_ctx_ptr,
+               u32 slot_id)
 {
-       return queue_command(xhci, in_ctx_ptr, 0, 0,
+       /* Split the dma_addr_t in_ctx_ptr into its lower and upper 32 bits,
+        * one command field each, instead of passing it as a single field.
+        * The result of queue_command() is returned unchanged.
+        */
+       return queue_command(xhci, lower_32_bits(in_ctx_ptr),
+                       upper_32_bits(in_ctx_ptr), 0,
                        TRB_TYPE(TRB_ADDR_DEV) | SLOT_ID_FOR_TRB(slot_id));
 }
+
+/* Queue a configure endpoint command TRB.
+ *
+ * in_ctx_ptr is split into its lower and upper 32 bits, which become the
+ * first two command fields; the third field is zero.  slot_id is encoded in
+ * the last field together with the TRB_CONFIG_EP type.
+ *
+ * Returns whatever queue_command() returns.
+ */
+int xhci_queue_configure_endpoint(struct xhci_hcd *xhci, dma_addr_t in_ctx_ptr,
+               u32 slot_id)
+{
+       return queue_command(xhci, lower_32_bits(in_ctx_ptr),
+                       upper_32_bits(in_ctx_ptr), 0,
+                       TRB_TYPE(TRB_CONFIG_EP) | SLOT_ID_FOR_TRB(slot_id));
+}
+
+/* Queue a command of type TRB_STOP_RING for the endpoint identified by
+ * slot_id and ep_index.
+ *
+ * The first three command fields are zero; the slot ID, endpoint ID and TRB
+ * type are OR-ed together into the last field.
+ *
+ * Returns whatever queue_command() returns.
+ */
+int xhci_queue_stop_endpoint(struct xhci_hcd *xhci, int slot_id,
+               unsigned int ep_index)
+{
+       u32 trb_slot_id = SLOT_ID_FOR_TRB(slot_id);
+       u32 trb_ep_index = EP_ID_FOR_TRB(ep_index);
+       u32 type = TRB_TYPE(TRB_STOP_RING);
+
+       return queue_command(xhci, 0, 0, 0,
+                       trb_slot_id | trb_ep_index | type);
+}
+
+/* Set Transfer Ring Dequeue Pointer command.
+ * This should not be used for endpoints that have streams enabled.
+ *
+ * deq_seg and deq_ptr identify the TRB to use as the dequeue pointer; the
+ * pair is translated to a DMA address with xhci_trb_virt_to_dma().
+ * cycle_state is OR-ed into the lower 32 bits of that address.  The slot ID,
+ * endpoint ID and TRB_SET_DEQ type are OR-ed into the last command field.
+ *
+ * Returns the result of queue_command().  If the address translation fails,
+ * two warnings are logged, nothing is queued, and 0 is returned.
+ * NOTE(review): if queue_command() returns 0 on success, a caller cannot tell
+ * that failure apart from success by the return value alone -- confirm
+ * whether callers rely on it.
+ */
+static int queue_set_tr_deq(struct xhci_hcd *xhci, int slot_id,
+               unsigned int ep_index, struct xhci_segment *deq_seg,
+               union xhci_trb *deq_ptr, u32 cycle_state)
+{
+       dma_addr_t addr;
+       u32 trb_slot_id = SLOT_ID_FOR_TRB(slot_id);
+       u32 trb_ep_index = EP_ID_FOR_TRB(ep_index);
+       u32 type = TRB_TYPE(TRB_SET_DEQ);
+
+       addr = xhci_trb_virt_to_dma(deq_seg, deq_ptr);
+       /* xhci_trb_virt_to_dma() returns 0 when the TRB is not in the segment
+        * (or when either pointer is NULL).
+        */
+       if (addr == 0) {
+               xhci_warn(xhci, "WARN Cannot submit Set TR Deq Ptr\n");
+               xhci_warn(xhci, "WARN deq seg = %p, deq pt = %p\n",
+                               deq_seg, deq_ptr);
+               return 0;
+       }
+       return queue_command(xhci, lower_32_bits(addr) | cycle_state,
+                       upper_32_bits(addr), 0,
+                       trb_slot_id | trb_ep_index | type);
+}
+
+/* Queue a command of type TRB_RESET_EP for the endpoint identified by
+ * slot_id and ep_index.
+ *
+ * The first three command fields are zero; the slot ID, endpoint ID and TRB
+ * type are OR-ed together into the last field.
+ *
+ * Returns whatever queue_command() returns.
+ */
+int xhci_queue_reset_ep(struct xhci_hcd *xhci, int slot_id,
+               unsigned int ep_index)
+{
+       u32 trb_slot_id = SLOT_ID_FOR_TRB(slot_id);
+       u32 trb_ep_index = EP_ID_FOR_TRB(ep_index);
+       u32 type = TRB_TYPE(TRB_RESET_EP);
+
+       return queue_command(xhci, 0, 0, 0, trb_slot_id | trb_ep_index | type);
+}