SAFE public projects git trees. - safe/jmp/linux-2.6/blob - drivers/net/cxgb3/sge.c

   1 /*
   2  * Copyright (c) 2005-2007 Chelsio, Inc. All rights reserved.
   3  *
   4  * This software is available to you under a choice of one of two
   5  * licenses.  You may choose to be licensed under the terms of the GNU
   6  * General Public License (GPL) Version 2, available from the file
   7  * COPYING in the main directory of this source tree, or the
   8  * OpenIB.org BSD license below:
   9  *
  10  *     Redistribution and use in source and binary forms, with or
  11  *     without modification, are permitted provided that the following
  12  *     conditions are met:
  13  *
  14  *      - Redistributions of source code must retain the above
  15  *        copyright notice, this list of conditions and the following
  16  *        disclaimer.
  17  *
  18  *      - Redistributions in binary form must reproduce the above
  19  *        copyright notice, this list of conditions and the following
  20  *        disclaimer in the documentation and/or other materials
  21  *        provided with the distribution.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30  * SOFTWARE.
  31  */
  32 #include <linux/skbuff.h>
  33 #include <linux/netdevice.h>
  34 #include <linux/etherdevice.h>
  35 #include <linux/if_vlan.h>
  36 #include <linux/ip.h>
  37 #include <linux/tcp.h>
  38 #include <linux/dma-mapping.h>
  39 #include "common.h"
  40 #include "regs.h"
  41 #include "sge_defs.h"
  42 #include "t3_cpl.h"
  43 #include "firmware_exports.h"
  44
  45 #define USE_GTS 0
  46
  47 #define SGE_RX_SM_BUF_SIZE 1536
  48
     /*
      * Rx packets of at most SGE_RX_COPY_THRES bytes are copied into a freshly
      * allocated sk_buff so that the original buffer can be recycled (see
      * get_packet() and get_packet_pg()).  SGE_RX_PULL_LEN is the number of
      * bytes get_packet_pg() copies into the linear part of the sk_buff when
      * the remainder of the packet is attached as a page fragment.
      */
  49 #define SGE_RX_COPY_THRES  256
  50 #define SGE_RX_PULL_LEN    128
  51
  52 /*
  53  * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks.
  54  * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
  55  * directly.
  56  */
  57 #define FL0_PG_CHUNK_SIZE  2048
  58
  59 #define SGE_RX_DROP_THRES 16
  60
  61 /*
  62  * Period of the Tx buffer reclaim timer.  This timer does not need to run
  63  * frequently as Tx buffers are usually reclaimed by new Tx packets.
  64  */
  65 #define TX_RECLAIM_PERIOD (HZ / 4)
  66
  67 /* WR size in bytes: WR_FLITS flits of 8 bytes (one u64) each */
  68 #define WR_LEN (WR_FLITS * 8)
  69
  70 /*
  71  * Types of Tx queues in each queue set.  Order here matters, do not change.
      * The values are used as indices into the txq[] array of struct sge_qset,
      * e.g. in init_qset_cntxt().
  72  */
  73 enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
  74
  75 /* Values for sge_txq.flags; each one is a separate bit so they can be combined */
  76 enum {
  77         TXQ_RUNNING = 1 << 0,   /* fetch engine is running */
  78         TXQ_LAST_PKT_DB = 1 << 1,       /* last packet rang the doorbell */
  79 };
  80
  81 struct tx_desc {                /* Tx ring descriptor: TX_DESC_FLITS 64-bit flits */
  82         u64 flit[TX_DESC_FLITS];
  83 };
  84
  85 struct rx_desc {                /* free-list descriptor, all fields big endian */
  86         __be32 addr_lo;         /* low 32 bits of the buffer's DMA address */
  87         __be32 len_gen;         /* set to V_FLD_GEN1(gen) by add_one_rx_buf() */
  88         __be32 gen2;            /* set to V_FLD_GEN2(gen) by add_one_rx_buf() */
  89         __be32 addr_hi;         /* high 32 bits of the buffer's DMA address */
  90 };
  91
  92 struct tx_sw_desc {             /* SW state per Tx descriptor */
  93         struct sk_buff *skb;    /* non-NULL when an SGL is present, see free_tx_desc() */
  94 };
  95
  96 struct rx_sw_desc {                /* SW state per Rx descriptor */
  97         union {                 /* valid member is selected by sge_fl.use_pages */
  98                 struct sk_buff *skb;
  99                 struct fl_pg_chunk pg_chunk;
 100         };
 101         DECLARE_PCI_UNMAP_ADDR(dma_addr);       /* buffer DMA address, kept for unmapping */
 102 };
 103
 104 struct rsp_desc {               /* response queue descriptor */
 105         struct rss_header rss_hdr;
 106         __be32 flags;
 107         __be32 len_cq;
 108         u8 imm_data[47];        /* immediate data, copied out by get_imm_packet() */
 109         u8 intr_gen;
 110 };
 111
 112 struct unmap_info {             /* packet unmapping info, overlays skb->cb */
 113         int sflit;              /* start flit of first SGL entry in Tx descriptor; signed, unmap_skb() may store -1 */
 114         u16 fragidx;            /* first page fragment in current Tx descriptor */
 115         u16 addr_idx;           /* buffer index of first SGL entry in descriptor */
 116         u32 len;                /* mapped length of skb main body; unmap_skb() zeroes it once unmapped */
 117 };
 118
 119 /*
 120  * Holds unmapping information for Tx packets that need deferred unmapping.
 121  * This structure lives at skb->head and must be allocated by callers.
 122  */
 123 struct deferred_unmap_info {
 124         struct pci_dev *pdev;   /* presumably the device the buffers were mapped for -- TODO confirm */
 125         dma_addr_t addr[MAX_SKB_FRAGS + 1];     /* room for MAX_SKB_FRAGS + 1 DMA addresses */
 126 };
 127
 128 /*
 129  * Maps a number of flits to the number of Tx descriptors that can hold them.
 130  * The formula is
 131  *
 132  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
 133  *
 134  * HW allows up to 4 descriptors to be combined into a WR.
      *
      * The table is indexed by the flit count (entry 0 is 0) and is read by
      * flits_to_desc(), which BUG()s on an index beyond the end of the table.
      * The number of entries depends on SGE_NUM_GENBITS.
 135  */
 136 static u8 flit_desc_map[] = {
 137         0,
 138 #if SGE_NUM_GENBITS == 1
 139         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 140         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 141         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 142         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
 143 #elif SGE_NUM_GENBITS == 2
 144         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 145         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 146         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 147         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 148 #else
 149 # error "SGE_NUM_GENBITS must be 1 or 2"
 150 #endif
 151 };
 152
 153 static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx)
 154 {
 155         return container_of(q, struct sge_qset, fl[qidx]);
 156 }
 157
 158 static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q)
 159 {
 160         return container_of(q, struct sge_qset, rspq);
 161 }
 162
 163 static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx)
 164 {
 165         return container_of(q, struct sge_qset, txq[qidx]);
 166 }
 167
 168 /**
 169  *      refill_rspq - replenish an SGE response queue
 170  *      @adapter: the adapter
 171  *      @q: the response queue to replenish
 172  *      @credits: how many new responses to make available
 173  *
 174  *      Replenishes a response queue by making the supplied number of responses
 175  *      available to HW.
 176  */
 177 static inline void refill_rspq(struct adapter *adapter,
 178                                const struct sge_rspq *q, unsigned int credits)
 179 {
 180         t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN,
 181                      V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
 182 }
 183
 184 /**
 185  *      need_skb_unmap - does the platform need unmapping of sk_buffs?
 186  *
 187  *      Returns true if the platfrom needs sk_buff unmapping.  The compiler
 188  *      optimizes away unecessary code if this returns true.
 189  */
 190 static inline int need_skb_unmap(void)
 191 {
 192         /*
 193          * This structure is used to tell if the platfrom needs buffer
 194          * unmapping by checking if DECLARE_PCI_UNMAP_ADDR defines anything.
 195          */
 196         struct dummy {
 197                 DECLARE_PCI_UNMAP_ADDR(addr);
 198         };
 199
 200         return sizeof(struct dummy) != 0;
 201 }
 202
 203 /**
 204  *      unmap_skb - unmap a packet main body and its page fragments
 205  *      @skb: the packet
 206  *      @q: the Tx queue containing Tx descriptors for the packet
 207  *      @cidx: index of Tx descriptor
 208  *      @pdev: the PCI device
 209  *
 210  *      Unmap the main body of an sk_buff and its page fragments, if any.
 211  *      Because of the fairly complicated structure of our SGLs and the desire
 212  *      to conserve space for metadata, we keep the information necessary to
 213  *      unmap an sk_buff partly in the sk_buff itself (in its cb), and partly
 214  *      in the Tx descriptors (the physical addresses of the various data
 215  *      buffers).  The send functions initialize the state in skb->cb so we
 216  *      can unmap the buffers held in the first Tx descriptor here, and we
 217  *      have enough information at this point to update the state for the next
 218  *      Tx descriptor.
 219  */
 220 static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q,
 221                              unsigned int cidx, struct pci_dev *pdev)
 222 {
 223         const struct sg_ent *sgp;
 224         struct unmap_info *ui = (struct unmap_info *)skb->cb;
 225         int nfrags, frag_idx, curflit, j = ui->addr_idx;
 226
 227         sgp = (struct sg_ent *)&q->desc[cidx].flit[ui->sflit];
 228
 229         if (ui->len) {
 230                 pci_unmap_single(pdev, be64_to_cpu(sgp->addr[0]), ui->len,
 231                                  PCI_DMA_TODEVICE);
 232                 ui->len = 0;    /* so we know for next descriptor for this skb */
 233                 j = 1;
 234         }
 235
 236         frag_idx = ui->fragidx;
 237         curflit = ui->sflit + 1 + j;
 238         nfrags = skb_shinfo(skb)->nr_frags;
 239
 240         while (frag_idx < nfrags && curflit < WR_FLITS) {
 241                 pci_unmap_page(pdev, be64_to_cpu(sgp->addr[j]),
 242                                skb_shinfo(skb)->frags[frag_idx].size,
 243                                PCI_DMA_TODEVICE);
 244                 j ^= 1;
 245                 if (j == 0) {
 246                         sgp++;
 247                         curflit++;
 248                 }
 249                 curflit++;
 250                 frag_idx++;
 251         }
 252
 253         if (frag_idx < nfrags) {        /* SGL continues into next Tx descriptor */
 254                 ui->fragidx = frag_idx;
 255                 ui->addr_idx = j;
 256                 ui->sflit = curflit - WR_FLITS - j;     /* sflit can be -1 */
 257         }
 258 }
 259
 260 /**
 261  *      free_tx_desc - reclaims Tx descriptors and their buffers
 262  *      @adapter: the adapter
 263  *      @q: the Tx queue to reclaim descriptors from
 264  *      @n: the number of descriptors to reclaim
 265  *
 266  *      Reclaims Tx descriptors from an SGE Tx queue and frees the associated
 267  *      Tx buffers.  Called with the Tx queue lock held.
 268  */
 269 static void free_tx_desc(struct adapter *adapter, struct sge_txq *q,
 270                          unsigned int n)
 271 {
 272         struct tx_sw_desc *d;
 273         struct pci_dev *pdev = adapter->pdev;
 274         unsigned int cidx = q->cidx;
 275
 276         const int need_unmap = need_skb_unmap() &&
 277                                q->cntxt_id >= FW_TUNNEL_SGEEC_START;
 278
 279         d = &q->sdesc[cidx];
 280         while (n--) {
 281                 if (d->skb) {   /* an SGL is present */
 282                         if (need_unmap)
 283                                 unmap_skb(d->skb, q, cidx, pdev);
 284                         if (d->skb->priority == cidx)
 285                                 kfree_skb(d->skb);
 286                 }
 287                 ++d;
 288                 if (++cidx == q->size) {
 289                         cidx = 0;
 290                         d = q->sdesc;
 291                 }
 292         }
 293         q->cidx = cidx;
 294 }
 295
 296 /**
 297  *      reclaim_completed_tx - reclaims completed Tx descriptors
 298  *      @adapter: the adapter
 299  *      @q: the Tx queue to reclaim completed descriptors from
 300  *
 301  *      Reclaims Tx descriptors that the SGE has indicated it has processed,
 302  *      and frees the associated buffers if possible.  Called with the Tx
 303  *      queue's lock held.
 304  */
 305 static inline void reclaim_completed_tx(struct adapter *adapter,
 306                                         struct sge_txq *q)
 307 {
 308         unsigned int reclaim = q->processed - q->cleaned;
 309
 310         if (reclaim) {
 311                 free_tx_desc(adapter, q, reclaim);
 312                 q->cleaned += reclaim;
 313                 q->in_use -= reclaim;
 314         }
 315 }
 316
 317 /**
 318  *      should_restart_tx - are there enough resources to restart a Tx queue?
 319  *      @q: the Tx queue
 320  *
 321  *      Checks if there are enough descriptors to restart a suspended Tx queue.
 322  */
 323 static inline int should_restart_tx(const struct sge_txq *q)
 324 {
 325         unsigned int r = q->processed - q->cleaned;
 326
 327         return q->in_use - r < (q->size >> 1);
 328 }
 329
 330 /**
 331  *      free_rx_bufs - free the Rx buffers on an SGE free list
 332  *      @pdev: the PCI device associated with the adapter
 333  *      @rxq: the SGE free list to clean up
 334  *
 335  *      Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
 336  *      this queue should be stopped before calling this function.
 337  */
 338 static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q)
 339 {
 340         unsigned int cidx = q->cidx;
 341
 342         while (q->credits--) {
 343                 struct rx_sw_desc *d = &q->sdesc[cidx];
 344
 345                 pci_unmap_single(pdev, pci_unmap_addr(d, dma_addr),
 346                                  q->buf_size, PCI_DMA_FROMDEVICE);
 347                 if (q->use_pages) {
 348                         put_page(d->pg_chunk.page);
 349                         d->pg_chunk.page = NULL;
 350                 } else {
 351                         kfree_skb(d->skb);
 352                         d->skb = NULL;
 353                 }
 354                 if (++cidx == q->size)
 355                         cidx = 0;
 356         }
 357
 358         if (q->pg_chunk.page) {
 359                 __free_page(q->pg_chunk.page);
 360                 q->pg_chunk.page = NULL;
 361         }
 362 }
 363
 364 /**
 365  *      add_one_rx_buf - add a packet buffer to a free-buffer list
 366  *      @va:  buffer start VA
 367  *      @len: the buffer length
 368  *      @d: the HW Rx descriptor to write
 369  *      @sd: the SW Rx descriptor to write
 370  *      @gen: the generation bit value
 371  *      @pdev: the PCI device associated with the adapter
 372  *
 373  *      Add a buffer of the given length to the supplied HW and SW Rx
 374  *      descriptors.
 375  */
 376 static inline void add_one_rx_buf(void *va, unsigned int len,
 377                                   struct rx_desc *d, struct rx_sw_desc *sd,
 378                                   unsigned int gen, struct pci_dev *pdev)
 379 {
 380         dma_addr_t mapping;
 381
 382         mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE);
 383         pci_unmap_addr_set(sd, dma_addr, mapping);
 384
 385         d->addr_lo = cpu_to_be32(mapping);
 386         d->addr_hi = cpu_to_be32((u64) mapping >> 32);
 387         wmb();
 388         d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
 389         d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
 390 }
 391
 392 static int alloc_pg_chunk(struct sge_fl *q, struct rx_sw_desc *sd, gfp_t gfp)
 393 {
 394         if (!q->pg_chunk.page) {
 395                 q->pg_chunk.page = alloc_page(gfp);
 396                 if (unlikely(!q->pg_chunk.page))
 397                         return -ENOMEM;
 398                 q->pg_chunk.va = page_address(q->pg_chunk.page);
 399                 q->pg_chunk.offset = 0;
 400         }
 401         sd->pg_chunk = q->pg_chunk;
 402
 403         q->pg_chunk.offset += q->buf_size;
 404         if (q->pg_chunk.offset == PAGE_SIZE)
 405                 q->pg_chunk.page = NULL;
 406         else {
 407                 q->pg_chunk.va += q->buf_size;
 408                 get_page(q->pg_chunk.page);
 409         }
 410         return 0;
 411 }
 412
 413 /**
 414  *      refill_fl - refill an SGE free-buffer list
 415  *      @adapter: the adapter
 416  *      @q: the free-list to refill
 417  *      @n: the number of new buffers to allocate
 418  *      @gfp: the gfp flags for allocating new buffers
 419  *
 420  *      (Re)populate an SGE free-buffer list with up to @n new packet buffers,
 421  *      allocated with the supplied gfp flags.  The caller must assure that
 422  *      @n does not exceed the queue's capacity.
 423  */
 424 static void refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
 425 {
 426         void *buf_start;
 427         struct rx_sw_desc *sd = &q->sdesc[q->pidx];
 428         struct rx_desc *d = &q->desc[q->pidx];
 429
 430         while (n--) {
 431                 if (q->use_pages) {
 432                         if (unlikely(alloc_pg_chunk(q, sd, gfp))) {
 433 nomem:                          q->alloc_failed++;
 434                                 break;
 435                         }
 436                         buf_start = sd->pg_chunk.va;
 437                 } else {
 438                         struct sk_buff *skb = alloc_skb(q->buf_size, gfp);
 439
 440                         if (!skb)
 441                                 goto nomem;
 442
 443                         sd->skb = skb;
 444                         buf_start = skb->data;
 445                 }
 446
 447                 add_one_rx_buf(buf_start, q->buf_size, d, sd, q->gen,
 448                                adap->pdev);
 449                 d++;
 450                 sd++;
 451                 if (++q->pidx == q->size) {
 452                         q->pidx = 0;
 453                         q->gen ^= 1;
 454                         sd = q->sdesc;
 455                         d = q->desc;
 456                 }
 457                 q->credits++;
 458         }
 459
 460         t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
 461 }
 462
 463 static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
 464 {
 465         refill_fl(adap, fl, min(16U, fl->size - fl->credits), GFP_ATOMIC);
 466 }
 467
 468 /**
 469  *      recycle_rx_buf - recycle a receive buffer
 470  *      @adapter: the adapter
 471  *      @q: the SGE free list
 472  *      @idx: index of buffer to recycle
 473  *
 474  *      Recycles the specified buffer on the given free list by adding it at
 475  *      the next available slot on the list.
 476  */
 477 static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q,
 478                            unsigned int idx)
 479 {
 480         struct rx_desc *from = &q->desc[idx];
 481         struct rx_desc *to = &q->desc[q->pidx];
 482
 483         q->sdesc[q->pidx] = q->sdesc[idx];
 484         to->addr_lo = from->addr_lo;    /* already big endian */
 485         to->addr_hi = from->addr_hi;    /* likewise */
 486         wmb();
 487         to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen));
 488         to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen));
 489         q->credits++;
 490
 491         if (++q->pidx == q->size) {
 492                 q->pidx = 0;
 493                 q->gen ^= 1;
 494         }
 495         t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
 496 }
 497
 498 /**
 499  *      alloc_ring - allocate resources for an SGE descriptor ring
 500  *      @pdev: the PCI device
 501  *      @nelem: the number of descriptors
 502  *      @elem_size: the size of each descriptor
 503  *      @sw_size: the size of the SW state associated with each ring element
 504  *      @phys: the physical address of the allocated ring
 505  *      @metadata: address of the array holding the SW state for the ring
 506  *
 507  *      Allocates resources for an SGE descriptor ring, such as Tx queues,
 508  *      free buffer lists, or response queues.  Each SGE ring requires
 509  *      space for its HW descriptors plus, optionally, space for the SW state
 510  *      associated with each HW entry (the metadata).  The function returns
 511  *      three values: the virtual address for the HW ring (the return value
 512  *      of the function), the physical address of the HW ring, and the address
 513  *      of the SW ring.
 514  */
 515 static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
 516                         size_t sw_size, dma_addr_t * phys, void *metadata)
 517 {
 518         size_t len = nelem * elem_size;
 519         void *s = NULL;
 520         void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
 521
 522         if (!p)
 523                 return NULL;
 524         if (sw_size) {
 525                 s = kcalloc(nelem, sw_size, GFP_KERNEL);
 526
 527                 if (!s) {
 528                         dma_free_coherent(&pdev->dev, len, p, *phys);
 529                         return NULL;
 530                 }
 531         }
 532         if (metadata)
 533                 *(void **)metadata = s;
 534         memset(p, 0, len);
 535         return p;
 536 }
 537
 538 /**
 539  *      free_qset - free the resources of an SGE queue set
 540  *      @adapter: the adapter owning the queue set
 541  *      @q: the queue set
 542  *
 543  *      Release the HW and SW resources associated with an SGE queue set, such
 544  *      as HW contexts, packet buffers, and descriptor rings.  Traffic to the
 545  *      queue set must be quiesced prior to calling this.
 546  */
 547 void t3_free_qset(struct adapter *adapter, struct sge_qset *q)
 548 {
 549         int i;
 550         struct pci_dev *pdev = adapter->pdev;
 551
 552         if (q->tx_reclaim_timer.function)
 553                 del_timer_sync(&q->tx_reclaim_timer);
 554
 555         for (i = 0; i < SGE_RXQ_PER_SET; ++i)
 556                 if (q->fl[i].desc) {
 557                         spin_lock(&adapter->sge.reg_lock);
 558                         t3_sge_disable_fl(adapter, q->fl[i].cntxt_id);
 559                         spin_unlock(&adapter->sge.reg_lock);
 560                         free_rx_bufs(pdev, &q->fl[i]);
 561                         kfree(q->fl[i].sdesc);
 562                         dma_free_coherent(&pdev->dev,
 563                                           q->fl[i].size *
 564                                           sizeof(struct rx_desc), q->fl[i].desc,
 565                                           q->fl[i].phys_addr);
 566                 }
 567
 568         for (i = 0; i < SGE_TXQ_PER_SET; ++i)
 569                 if (q->txq[i].desc) {
 570                         spin_lock(&adapter->sge.reg_lock);
 571                         t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0);
 572                         spin_unlock(&adapter->sge.reg_lock);
 573                         if (q->txq[i].sdesc) {
 574                                 free_tx_desc(adapter, &q->txq[i],
 575                                              q->txq[i].in_use);
 576                                 kfree(q->txq[i].sdesc);
 577                         }
 578                         dma_free_coherent(&pdev->dev,
 579                                           q->txq[i].size *
 580                                           sizeof(struct tx_desc),
 581                                           q->txq[i].desc, q->txq[i].phys_addr);
 582                         __skb_queue_purge(&q->txq[i].sendq);
 583                 }
 584
 585         if (q->rspq.desc) {
 586                 spin_lock(&adapter->sge.reg_lock);
 587                 t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id);
 588                 spin_unlock(&adapter->sge.reg_lock);
 589                 dma_free_coherent(&pdev->dev,
 590                                   q->rspq.size * sizeof(struct rsp_desc),
 591                                   q->rspq.desc, q->rspq.phys_addr);
 592         }
 593
 594         memset(q, 0, sizeof(*q));
 595 }
 596
 597 /**
 598  *      init_qset_cntxt - initialize an SGE queue set context info
 599  *      @qs: the queue set
 600  *      @id: the queue set id
 601  *
 602  *      Initializes the TIDs and context ids for the queues of a queue set.
 603  */
 604 static void init_qset_cntxt(struct sge_qset *qs, unsigned int id)
 605 {
 606         qs->rspq.cntxt_id = id;
 607         qs->fl[0].cntxt_id = 2 * id;
 608         qs->fl[1].cntxt_id = 2 * id + 1;
 609         qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
 610         qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
 611         qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
 612         qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
 613         qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
 614 }
 615
 616 /**
 617  *      sgl_len - calculates the size of an SGL of the given capacity
 618  *      @n: the number of SGL entries
 619  *
 620  *      Calculates the number of flits needed for a scatter/gather list that
 621  *      can hold the given number of entries.
 622  */
 623 static inline unsigned int sgl_len(unsigned int n)
 624 {
 625         /* alternatively: 3 * (n / 2) + 2 * (n & 1) */
 626         return (3 * n) / 2 + (n & 1);
 627 }
 628
 629 /**
 630  *      flits_to_desc - returns the num of Tx descriptors for the given flits
 631  *      @n: the number of flits
 632  *
 633  *      Calculates the number of Tx descriptors needed for the supplied number
 634  *      of flits.
 635  */
 636 static inline unsigned int flits_to_desc(unsigned int n)
 637 {
 638         BUG_ON(n >= ARRAY_SIZE(flit_desc_map));
 639         return flit_desc_map[n];
 640 }
 641
 642 /**
 643  *      get_packet - return the next ingress packet buffer from a free list
 644  *      @adap: the adapter that received the packet
 645  *      @fl: the SGE free list holding the packet
 646  *      @len: the packet length including any SGE padding
 647  *      @drop_thres: # of remaining buffers before we start dropping packets
 648  *
 649  *      Get the next packet from a free list and complete setup of the
 650  *      sk_buff.  If the packet is small we make a copy and recycle the
 651  *      original buffer, otherwise we use the original buffer itself.  If a
 652  *      positive drop threshold is supplied packets are dropped and their
 653  *      buffers recycled if (a) the number of remaining buffers is under the
 654  *      threshold and the packet is too big to copy, or (b) the packet should
 655  *      be copied but there is no memory for the copy.
 656  */
 657 static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl,
 658                                   unsigned int len, unsigned int drop_thres)
 659 {
 660         struct sk_buff *skb = NULL;
 661         struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
 662
 663         prefetch(sd->skb->data);
 664         fl->credits--;
 665
 666         if (len <= SGE_RX_COPY_THRES) {
 667                 skb = alloc_skb(len, GFP_ATOMIC);
 668                 if (likely(skb != NULL)) {
 669                         __skb_put(skb, len);
 670                         pci_dma_sync_single_for_cpu(adap->pdev,
 671                                             pci_unmap_addr(sd, dma_addr), len,
 672                                             PCI_DMA_FROMDEVICE);
 673                         memcpy(skb->data, sd->skb->data, len);
 674                         pci_dma_sync_single_for_device(adap->pdev,
 675                                             pci_unmap_addr(sd, dma_addr), len,
 676                                             PCI_DMA_FROMDEVICE);
 677                 } else if (!drop_thres)
 678                         goto use_orig_buf;
 679 recycle:
 680                 recycle_rx_buf(adap, fl, fl->cidx);
 681                 return skb;
 682         }
 683
 684         if (unlikely(fl->credits < drop_thres))
 685                 goto recycle;
 686
 687 use_orig_buf:
 688         pci_unmap_single(adap->pdev, pci_unmap_addr(sd, dma_addr),
 689                          fl->buf_size, PCI_DMA_FROMDEVICE);
 690         skb = sd->skb;
 691         skb_put(skb, len);
 692         __refill_fl(adap, fl);
 693         return skb;
 694 }
 695
 696 /**
 697  *      get_packet_pg - return the next ingress packet buffer from a free list
 698  *      @adap: the adapter that received the packet
 699  *      @fl: the SGE free list holding the packet
 700  *      @len: the packet length including any SGE padding
 701  *      @drop_thres: # of remaining buffers before we start dropping packets
 702  *
 703  *      Get the next packet from a free list populated with page chunks.
 704  *      If the packet is small we make a copy and recycle the original buffer,
 705  *      otherwise we attach the original buffer as a page fragment to a fresh
 706  *      sk_buff.  If a positive drop threshold is supplied packets are dropped
 707  *      and their buffers recycled if (a) the number of remaining buffers is
 708  *      under the threshold and the packet is too big to copy, or (b) there's
 709  *      no system memory.
 710  *
 711  *      Note: this function is similar to @get_packet but deals with Rx buffers
 712  *      that are page chunks rather than sk_buffs.
 713  */
 714 static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl,
 715                                      unsigned int len, unsigned int drop_thres)
 716 {
 717         struct sk_buff *skb = NULL;
 718         struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
 719
 720         if (len <= SGE_RX_COPY_THRES) {
 721                 skb = alloc_skb(len, GFP_ATOMIC);
 722                 if (likely(skb != NULL)) {
 723                         __skb_put(skb, len);
 724                         pci_dma_sync_single_for_cpu(adap->pdev,
 725                                             pci_unmap_addr(sd, dma_addr), len,
 726                                             PCI_DMA_FROMDEVICE);
 727                         memcpy(skb->data, sd->pg_chunk.va, len);
 728                         pci_dma_sync_single_for_device(adap->pdev,
 729                                             pci_unmap_addr(sd, dma_addr), len,
 730                                             PCI_DMA_FROMDEVICE);
 731                 } else if (!drop_thres)
 732                         return NULL;
 733 recycle:
 734                 fl->credits--;
 735                 recycle_rx_buf(adap, fl, fl->cidx);
 736                 return skb;
 737         }
 738
 739         if (unlikely(fl->credits <= drop_thres))
 740                 goto recycle;
 741
 742         skb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC);
 743         if (unlikely(!skb)) {
 744                 if (!drop_thres)
 745                         return NULL;
 746                 goto recycle;
 747         }
 748
 749         pci_unmap_single(adap->pdev, pci_unmap_addr(sd, dma_addr),
 750                          fl->buf_size, PCI_DMA_FROMDEVICE);
 751         __skb_put(skb, SGE_RX_PULL_LEN);
 752         memcpy(skb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN);
 753         skb_fill_page_desc(skb, 0, sd->pg_chunk.page,
 754                            sd->pg_chunk.offset + SGE_RX_PULL_LEN,
 755                            len - SGE_RX_PULL_LEN);
 756         skb->len = len;
 757         skb->data_len = len - SGE_RX_PULL_LEN;
 758         skb->truesize += skb->data_len;
 759
 760         fl->credits--;
 761         /*
 762          * We do not refill FLs here, we let the caller do it to overlap a
 763          * prefetch.
 764          */
 765         return skb;
 766 }
 767
 768 /**
 769  *      get_imm_packet - return the next ingress packet buffer from a response
 770  *      @resp: the response descriptor containing the packet data
 771  *
 772  *      Return a packet containing the immediate data of the given response.
 773  */
 774 static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp)
 775 {
 776         struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC);
 777
 778         if (skb) {
 779                 __skb_put(skb, IMMED_PKT_SIZE);
 780                 skb_copy_to_linear_data(skb, resp->imm_data, IMMED_PKT_SIZE);
 781         }
 782         return skb;
 783 }
 784
 785 /**
 786  *      calc_tx_descs - calculate the number of Tx descriptors for a packet
 787  *      @skb: the packet
 788  *
 789  *      Returns the number of Tx descriptors needed for the given Ethernet
 790  *      packet.  Ethernet packets require addition of WR and CPL headers.
 791  */
 792 static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
 793 {
 794         unsigned int flits;
 795
 796         if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt))
 797                 return 1;
 798
 799         flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2;
 800         if (skb_shinfo(skb)->gso_size)
 801                 flits++;
 802         return flits_to_desc(flits);
 803 }
 804
 805 /**
 806  *      make_sgl - populate a scatter/gather list for a packet
 807  *      @skb: the packet
 808  *      @sgp: the SGL to populate
 809  *      @start: start address of skb main body data to include in the SGL
 810  *      @len: length of skb main body data to include in the SGL
 811  *      @pdev: the PCI device
 812  *
 813  *      Generates a scatter/gather list for the buffers that make up a packet
 814  *      and returns the SGL size in 8-byte words.  The caller must size the SGL
 815  *      appropriately.
 816  */
 817 static inline unsigned int make_sgl(const struct sk_buff *skb,
 818                                     struct sg_ent *sgp, unsigned char *start,
 819                                     unsigned int len, struct pci_dev *pdev)
 820 {
 821         dma_addr_t mapping;
 822         unsigned int i, j = 0, nfrags;
 823
 824         if (len) {
 825                 mapping = pci_map_single(pdev, start, len, PCI_DMA_TODEVICE);
 826                 sgp->len[0] = cpu_to_be32(len);
 827                 sgp->addr[0] = cpu_to_be64(mapping);
 828                 j = 1;
 829         }
 830
 831         nfrags = skb_shinfo(skb)->nr_frags;
 832         for (i = 0; i < nfrags; i++) {
 833                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 834
 835                 mapping = pci_map_page(pdev, frag->page, frag->page_offset,
 836                                        frag->size, PCI_DMA_TODEVICE);
 837                 sgp->len[j] = cpu_to_be32(frag->size);
 838                 sgp->addr[j] = cpu_to_be64(mapping);
 839                 j ^= 1;
 840                 if (j == 0)
 841                         ++sgp;
 842         }
 843         if (j)
 844                 sgp->len[j] = 0;
 845         return ((nfrags + (len != 0)) * 3) / 2 + j;
 846 }
 847
 848 /**
 849  *      check_ring_tx_db - check and potentially ring a Tx queue's doorbell
 850  *      @adap: the adapter
 851  *      @q: the Tx queue
 852  *
 853  *      Ring the doorbel if a Tx queue is asleep.  There is a natural race,
 854  *      where the HW is going to sleep just after we checked, however,
 855  *      then the interrupt handler will detect the outstanding TX packet
 856  *      and ring the doorbell for us.
 857  *
 858  *      When GTS is disabled we unconditionally ring the doorbell.
 859  */
 860 static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q)
 861 {
 862 #if USE_GTS
 863         clear_bit(TXQ_LAST_PKT_DB, &q->flags);
 864         if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
 865                 set_bit(TXQ_LAST_PKT_DB, &q->flags);
 866                 t3_write_reg(adap, A_SG_KDOORBELL,
 867                              F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 868         }
 869 #else
 870         wmb();                  /* write descriptors before telling HW */
 871         t3_write_reg(adap, A_SG_KDOORBELL,
 872                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 873 #endif
 874 }
 875
 876 static inline void wr_gen2(struct tx_desc *d, unsigned int gen)
 877 {
 878 #if SGE_NUM_GENBITS == 2
 879         d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen);
 880 #endif
 881 }
 882
 883 /**
 884  *      write_wr_hdr_sgl - write a WR header and, optionally, SGL
 885  *      @ndesc: number of Tx descriptors spanned by the SGL
 886  *      @skb: the packet corresponding to the WR
 887  *      @d: first Tx descriptor to be written
 888  *      @pidx: index of above descriptors
 889  *      @q: the SGE Tx queue
 890  *      @sgl: the SGL
 891  *      @flits: number of flits to the start of the SGL in the first descriptor
 892  *      @sgl_flits: the SGL size in flits
 893  *      @gen: the Tx descriptor generation
 894  *      @wr_hi: top 32 bits of WR header based on WR type (big endian)
 895  *      @wr_lo: low 32 bits of WR header based on WR type (big endian)
 896  *
 897  *      Write a work request header and an associated SGL.  If the SGL is
 898  *      small enough to fit into one Tx descriptor it has already been written
 899  *      and we just need to write the WR header.  Otherwise we distribute the
 900  *      SGL across the number of descriptors it spans.
 901  */
 902 static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb,
 903                              struct tx_desc *d, unsigned int pidx,
 904                              const struct sge_txq *q,
 905                              const struct sg_ent *sgl,
 906                              unsigned int flits, unsigned int sgl_flits,
 907                              unsigned int gen, unsigned int wr_hi,
 908                              unsigned int wr_lo)
 909 {
 910         struct work_request_hdr *wrp = (struct work_request_hdr *)d;
 911         struct tx_sw_desc *sd = &q->sdesc[pidx];
 912
 913         sd->skb = skb;
 914         if (need_skb_unmap()) {
 915                 struct unmap_info *ui = (struct unmap_info *)skb->cb;
 916
 917                 ui->fragidx = 0;
 918                 ui->addr_idx = 0;
 919                 ui->sflit = flits;
 920         }
 921
 922         if (likely(ndesc == 1)) {
 923                 skb->priority = pidx;
 924                 wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
 925                                    V_WR_SGLSFLT(flits)) | wr_hi;
 926                 wmb();
 927                 wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
 928                                    V_WR_GEN(gen)) | wr_lo;
 929                 wr_gen2(d, gen);
 930         } else {
 931                 unsigned int ogen = gen;
 932                 const u64 *fp = (const u64 *)sgl;
 933                 struct work_request_hdr *wp = wrp;
 934
 935                 wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
 936                                    V_WR_SGLSFLT(flits)) | wr_hi;
 937
 938                 while (sgl_flits) {
 939                         unsigned int avail = WR_FLITS - flits;
 940
 941                         if (avail > sgl_flits)
 942                                 avail = sgl_flits;
 943                         memcpy(&d->flit[flits], fp, avail * sizeof(*fp));
 944                         sgl_flits -= avail;
 945                         ndesc--;
 946                         if (!sgl_flits)
 947                                 break;
 948
 949                         fp += avail;
 950                         d++;
 951                         sd++;
 952                         if (++pidx == q->size) {
 953                                 pidx = 0;
 954                                 gen ^= 1;
 955                                 d = q->desc;
 956                                 sd = q->sdesc;
 957                         }
 958
 959                         sd->skb = skb;
 960                         wrp = (struct work_request_hdr *)d;
 961                         wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
 962                                            V_WR_SGLSFLT(1)) | wr_hi;
 963                         wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
 964                                                         sgl_flits + 1)) |
 965                                            V_WR_GEN(gen)) | wr_lo;
 966                         wr_gen2(d, gen);
 967                         flits = 1;
 968                 }
 969                 skb->priority = pidx;
 970                 wrp->wr_hi |= htonl(F_WR_EOP);
 971                 wmb();
 972                 wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
 973                 wr_gen2((struct tx_desc *)wp, ogen);
 974                 WARN_ON(ndesc != 0);
 975         }
 976 }
 977
 978 /**
 979  *      write_tx_pkt_wr - write a TX_PKT work request
 980  *      @adap: the adapter
 981  *      @skb: the packet to send
 982  *      @pi: the egress interface
 983  *      @pidx: index of the first Tx descriptor to write
 984  *      @gen: the generation value to use
 985  *      @q: the Tx queue
 986  *      @ndesc: number of descriptors the packet will occupy
 987  *      @compl: the value of the COMPL bit to use
 988  *
 989  *      Generate a TX_PKT work request to send the supplied packet.
 990  */
 991 static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
 992                             const struct port_info *pi,
 993                             unsigned int pidx, unsigned int gen,
 994                             struct sge_txq *q, unsigned int ndesc,
 995                             unsigned int compl)
 996 {
 997         unsigned int flits, sgl_flits, cntrl, tso_info;
 998         struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
 999         struct tx_desc *d = &q->desc[pidx];
1000         struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d;
1001
1002         cpl->len = htonl(skb->len | 0x80000000);
1003         cntrl = V_TXPKT_INTF(pi->port_id);
1004
1005         if (vlan_tx_tag_present(skb) && pi->vlan_grp)
1006                 cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb));
1007
1008         tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
1009         if (tso_info) {
1010                 int eth_type;
1011                 struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl;
1012
1013                 d->flit[2] = 0;
1014                 cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1015                 hdr->cntrl = htonl(cntrl);
1016                 eth_type = skb_network_offset(skb) == ETH_HLEN ?
1017                     CPL_ETH_II : CPL_ETH_II_VLAN;
1018                 tso_info |= V_LSO_ETH_TYPE(eth_type) |
1019                     V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) |
1020                     V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff);
1021                 hdr->lso_info = htonl(tso_info);
1022                 flits = 3;
1023         } else {
1024                 cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1025                 cntrl |= F_TXPKT_IPCSUM_DIS;    /* SW calculates IP csum */
1026                 cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL);
1027                 cpl->cntrl = htonl(cntrl);
1028
1029                 if (skb->len <= WR_LEN - sizeof(*cpl)) {
1030                         q->sdesc[pidx].skb = NULL;
1031                         if (!skb->data_len)
1032                                 skb_copy_from_linear_data(skb, &d->flit[2],
1033                                                           skb->len);
1034                         else
1035                                 skb_copy_bits(skb, 0, &d->flit[2], skb->len);
1036
1037                         flits = (skb->len + 7) / 8 + 2;
1038                         cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) |
1039                                               V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT)
1040                                               | F_WR_SOP | F_WR_EOP | compl);
1041                         wmb();
1042                         cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) |
1043                                               V_WR_TID(q->token));
1044                         wr_gen2(d, gen);
1045                         kfree_skb(skb);
1046                         return;
1047                 }
1048
1049                 flits = 2;
1050         }
1051
1052         sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1053         sgl_flits = make_sgl(skb, sgp, skb->data, skb_headlen(skb), adap->pdev);
1054         if (need_skb_unmap())
1055                 ((struct unmap_info *)skb->cb)->len = skb_headlen(skb);
1056
1057         write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
1058                          htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
1059                          htonl(V_WR_TID(q->token)));
1060 }
1061
1062 /**
1063  *      eth_xmit - add a packet to the Ethernet Tx queue
1064  *      @skb: the packet
1065  *      @dev: the egress net device
1066  *
1067  *      Add a packet to an SGE Tx queue.  Runs with softirqs disabled.
1068  */
1069 int t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1070 {
1071         unsigned int ndesc, pidx, credits, gen, compl;
1072         const struct port_info *pi = netdev_priv(dev);
1073         struct adapter *adap = pi->adapter;
1074         struct sge_qset *qs = pi->qs;
1075         struct sge_txq *q = &qs->txq[TXQ_ETH];
1076
1077         /*
1078          * The chip min packet length is 9 octets but play safe and reject
1079          * anything shorter than an Ethernet header.
1080          */
1081         if (unlikely(skb->len < ETH_HLEN)) {
1082                 dev_kfree_skb(skb);
1083                 return NETDEV_TX_OK;
1084         }
1085
1086         spin_lock(&q->lock);
1087         reclaim_completed_tx(adap, q);
1088
1089         credits = q->size - q->in_use;
1090         ndesc = calc_tx_descs(skb);
1091
1092         if (unlikely(credits < ndesc)) {
1093                 if (!netif_queue_stopped(dev)) {
1094                         netif_stop_queue(dev);
1095                         set_bit(TXQ_ETH, &qs->txq_stopped);
1096                         q->stops++;
1097                         dev_err(&adap->pdev->dev,
1098                                 "%s: Tx ring %u full while queue awake!\n",
1099                                 dev->name, q->cntxt_id & 7);
1100                 }
1101                 spin_unlock(&q->lock);
1102                 return NETDEV_TX_BUSY;
1103         }
1104
1105         q->in_use += ndesc;
1106         if (unlikely(credits - ndesc < q->stop_thres)) {
1107                 q->stops++;
1108                 netif_stop_queue(dev);
1109                 set_bit(TXQ_ETH, &qs->txq_stopped);
1110 #if !USE_GTS
1111                 if (should_restart_tx(q) &&
1112                     test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1113                         q->restarts++;
1114                         netif_wake_queue(dev);
1115                 }
1116 #endif
1117         }
1118
1119         gen = q->gen;
1120         q->unacked += ndesc;
1121         compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1122         q->unacked &= 7;
1123         pidx = q->pidx;
1124         q->pidx += ndesc;
1125         if (q->pidx >= q->size) {
1126                 q->pidx -= q->size;
1127                 q->gen ^= 1;
1128         }
1129
1130         /* update port statistics */
1131         if (skb->ip_summed == CHECKSUM_COMPLETE)
1132                 qs->port_stats[SGE_PSTAT_TX_CSUM]++;
1133         if (skb_shinfo(skb)->gso_size)
1134                 qs->port_stats[SGE_PSTAT_TSO]++;
1135         if (vlan_tx_tag_present(skb) && pi->vlan_grp)
1136                 qs->port_stats[SGE_PSTAT_VLANINS]++;
1137
1138         dev->trans_start = jiffies;
1139         spin_unlock(&q->lock);
1140
1141         /*
1142          * We do not use Tx completion interrupts to free DMAd Tx packets.
1143          * This is good for performamce but means that we rely on new Tx
1144          * packets arriving to run the destructors of completed packets,
1145          * which open up space in their sockets' send queues.  Sometimes
1146          * we do not get such new packets causing Tx to stall.  A single
1147          * UDP transmitter is a good example of this situation.  We have
1148          * a clean up timer that periodically reclaims completed packets
1149          * but it doesn't run often enough (nor do we want it to) to prevent
1150          * lengthy stalls.  A solution to this problem is to run the
1151          * destructor early, after the packet is queued but before it's DMAd.
1152          * A cons is that we lie to socket memory accounting, but the amount
1153          * of extra memory is reasonable (limited by the number of Tx
1154          * descriptors), the packets do actually get freed quickly by new
1155          * packets almost always, and for protocols like TCP that wait for
1156          * acks to really free up the data the extra memory is even less.
1157          * On the positive side we run the destructors on the sending CPU
1158          * rather than on a potentially different completing CPU, usually a
1159          * good thing.  We also run them without holding our Tx queue lock,
1160          * unlike what reclaim_completed_tx() would otherwise do.
1161          *
1162          * Run the destructor before telling the DMA engine about the packet
1163          * to make sure it doesn't complete and get freed prematurely.
1164          */
1165         if (likely(!skb_shared(skb)))
1166                 skb_orphan(skb);
1167
1168         write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl);
1169         check_ring_tx_db(adap, q);
1170         return NETDEV_TX_OK;
1171 }
1172
1173 /**
1174  *      write_imm - write a packet into a Tx descriptor as immediate data
1175  *      @d: the Tx descriptor to write
1176  *      @skb: the packet
1177  *      @len: the length of packet data to write as immediate data
1178  *      @gen: the generation bit value to write
1179  *
1180  *      Writes a packet as immediate data into a Tx descriptor.  The packet
1181  *      contains a work request at its beginning.  We must write the packet
1182  *      carefully so the SGE doesn't read accidentally before it's written in
1183  *      its entirety.
1184  */
1185 static inline void write_imm(struct tx_desc *d, struct sk_buff *skb,
1186                              unsigned int len, unsigned int gen)
1187 {
1188         struct work_request_hdr *from = (struct work_request_hdr *)skb->data;
1189         struct work_request_hdr *to = (struct work_request_hdr *)d;
1190
1191         memcpy(&to[1], &from[1], len - sizeof(*from));
1192         to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1193                                         V_WR_BCNTLFLT(len & 7));
1194         wmb();
1195         to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1196                                         V_WR_LEN((len + 7) / 8));
1197         wr_gen2(d, gen);
1198         kfree_skb(skb);
1199 }
1200
1201 /**
1202  *      check_desc_avail - check descriptor availability on a send queue
1203  *      @adap: the adapter
1204  *      @q: the send queue
1205  *      @skb: the packet needing the descriptors
1206  *      @ndesc: the number of Tx descriptors needed
1207  *      @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1208  *
1209  *      Checks if the requested number of Tx descriptors is available on an
1210  *      SGE send queue.  If the queue is already suspended or not enough
1211  *      descriptors are available the packet is queued for later transmission.
1212  *      Must be called with the Tx queue locked.
1213  *
1214  *      Returns 0 if enough descriptors are available, 1 if there aren't
1215  *      enough descriptors and the packet has been queued, and 2 if the caller
1216  *      needs to retry because there weren't enough descriptors at the
1217  *      beginning of the call but some freed up in the mean time.
1218  */
1219 static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q,
1220                                    struct sk_buff *skb, unsigned int ndesc,
1221                                    unsigned int qid)
1222 {
1223         if (unlikely(!skb_queue_empty(&q->sendq))) {
1224               addq_exit:__skb_queue_tail(&q->sendq, skb);
1225                 return 1;
1226         }
1227         if (unlikely(q->size - q->in_use < ndesc)) {
1228                 struct sge_qset *qs = txq_to_qset(q, qid);
1229
1230                 set_bit(qid, &qs->txq_stopped);
1231                 smp_mb__after_clear_bit();
1232
1233                 if (should_restart_tx(q) &&
1234                     test_and_clear_bit(qid, &qs->txq_stopped))
1235                         return 2;
1236
1237                 q->stops++;
1238                 goto addq_exit;
1239         }
1240         return 0;
1241 }
1242
1243 /**
1244  *      reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1245  *      @q: the SGE control Tx queue
1246  *
1247  *      This is a variant of reclaim_completed_tx() that is used for Tx queues
1248  *      that send only immediate data (presently just the control queues) and
1249  *      thus do not have any sk_buffs to release.
1250  */
1251 static inline void reclaim_completed_tx_imm(struct sge_txq *q)
1252 {
1253         unsigned int reclaim = q->processed - q->cleaned;
1254
1255         q->in_use -= reclaim;
1256         q->cleaned += reclaim;
1257 }
1258
1259 static inline int immediate(const struct sk_buff *skb)
1260 {
1261         return skb->len <= WR_LEN && !skb->data_len;
1262 }
1263
1264 /**
1265  *      ctrl_xmit - send a packet through an SGE control Tx queue
1266  *      @adap: the adapter
1267  *      @q: the control queue
1268  *      @skb: the packet
1269  *
1270  *      Send a packet through an SGE control Tx queue.  Packets sent through
1271  *      a control queue must fit entirely as immediate data in a single Tx
1272  *      descriptor and have no page fragments.
1273  */
1274 static int ctrl_xmit(struct adapter *adap, struct sge_txq *q,
1275                      struct sk_buff *skb)
1276 {
1277         int ret;
1278         struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data;
1279
1280         if (unlikely(!immediate(skb))) {
1281                 WARN_ON(1);
1282                 dev_kfree_skb(skb);
1283                 return NET_XMIT_SUCCESS;
1284         }
1285
1286         wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1287         wrp->wr_lo = htonl(V_WR_TID(q->token));
1288
1289         spin_lock(&q->lock);
1290       again:reclaim_completed_tx_imm(q);
1291
1292         ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL);
1293         if (unlikely(ret)) {
1294                 if (ret == 1) {
1295                         spin_unlock(&q->lock);
1296                         return NET_XMIT_CN;
1297                 }
1298                 goto again;
1299         }
1300
1301         write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1302
1303         q->in_use++;
1304         if (++q->pidx >= q->size) {
1305                 q->pidx = 0;
1306                 q->gen ^= 1;
1307         }
1308         spin_unlock(&q->lock);
1309         wmb();
1310         t3_write_reg(adap, A_SG_KDOORBELL,
1311                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1312         return NET_XMIT_SUCCESS;
1313 }
1314
1315 /**
1316  *      restart_ctrlq - restart a suspended control queue
1317  *      @qs: the queue set cotaining the control queue
1318  *
1319  *      Resumes transmission on a suspended Tx control queue.
1320  */
1321 static void restart_ctrlq(unsigned long data)
1322 {
1323         struct sk_buff *skb;
1324         struct sge_qset *qs = (struct sge_qset *)data;
1325         struct sge_txq *q = &qs->txq[TXQ_CTRL];
1326
1327         spin_lock(&q->lock);
1328       again:reclaim_completed_tx_imm(q);
1329
1330         while (q->in_use < q->size &&
1331                (skb = __skb_dequeue(&q->sendq)) != NULL) {
1332
1333                 write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1334
1335                 if (++q->pidx >= q->size) {
1336                         q->pidx = 0;
1337                         q->gen ^= 1;
1338                 }
1339                 q->in_use++;
1340         }
1341
1342         if (!skb_queue_empty(&q->sendq)) {
1343                 set_bit(TXQ_CTRL, &qs->txq_stopped);
1344                 smp_mb__after_clear_bit();
1345
1346                 if (should_restart_tx(q) &&
1347                     test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1348                         goto again;
1349                 q->stops++;
1350         }
1351
1352         spin_unlock(&q->lock);
1353         t3_write_reg(qs->adap, A_SG_KDOORBELL,
1354                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1355 }
1356
1357 /*
1358  * Send a management message through control queue 0
1359  */
1360 int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
1361 {
1362         return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb);
1363 }
1364
1365 /**
1366  *      deferred_unmap_destructor - unmap a packet when it is freed
1367  *      @skb: the packet
1368  *
1369  *      This is the packet destructor used for Tx packets that need to remain
1370  *      mapped until they are freed rather than until their Tx descriptors are
1371  *      freed.
1372  */
1373 static void deferred_unmap_destructor(struct sk_buff *skb)
1374 {
1375         int i;
1376         const dma_addr_t *p;
1377         const struct skb_shared_info *si;
1378         const struct deferred_unmap_info *dui;
1379         const struct unmap_info *ui = (struct unmap_info *)skb->cb;
1380
1381         dui = (struct deferred_unmap_info *)skb->head;
1382         p = dui->addr;
1383
1384         if (ui->len)
1385                 pci_unmap_single(dui->pdev, *p++, ui->len, PCI_DMA_TODEVICE);
1386
1387         si = skb_shinfo(skb);
1388         for (i = 0; i < si->nr_frags; i++)
1389                 pci_unmap_page(dui->pdev, *p++, si->frags[i].size,
1390                                PCI_DMA_TODEVICE);
1391 }
1392
1393 static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
1394                                      const struct sg_ent *sgl, int sgl_flits)
1395 {
1396         dma_addr_t *p;
1397         struct deferred_unmap_info *dui;
1398
1399         dui = (struct deferred_unmap_info *)skb->head;
1400         dui->pdev = pdev;
1401         for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) {
1402                 *p++ = be64_to_cpu(sgl->addr[0]);
1403                 *p++ = be64_to_cpu(sgl->addr[1]);
1404         }
1405         if (sgl_flits)
1406                 *p = be64_to_cpu(sgl->addr[0]);
1407 }
1408
1409 /**
1410  *      write_ofld_wr - write an offload work request
1411  *      @adap: the adapter
1412  *      @skb: the packet to send
1413  *      @q: the Tx queue
1414  *      @pidx: index of the first Tx descriptor to write
1415  *      @gen: the generation value to use
1416  *      @ndesc: number of descriptors the packet will occupy
1417  *
1418  *      Write an offload work request to send the supplied packet.  The packet
1419  *      data already carry the work request with most fields populated.
1420  */
1421 static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
1422                           struct sge_txq *q, unsigned int pidx,
1423                           unsigned int gen, unsigned int ndesc)
1424 {
1425         unsigned int sgl_flits, flits;
1426         struct work_request_hdr *from;
1427         struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1428         struct tx_desc *d = &q->desc[pidx];
1429
1430         if (immediate(skb)) {
1431                 q->sdesc[pidx].skb = NULL;
1432                 write_imm(d, skb, skb->len, gen);
1433                 return;
1434         }
1435
1436         /* Only TX_DATA builds SGLs */
1437
1438         from = (struct work_request_hdr *)skb->data;
1439         memcpy(&d->flit[1], &from[1],
1440                skb_transport_offset(skb) - sizeof(*from));
1441
1442         flits = skb_transport_offset(skb) / 8;
1443         sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1444         sgl_flits = make_sgl(skb, sgp, skb_transport_header(skb),
1445                              skb->tail - skb->transport_header,
1446                              adap->pdev);
1447         if (need_skb_unmap()) {
1448                 setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
1449                 skb->destructor = deferred_unmap_destructor;
1450                 ((struct unmap_info *)skb->cb)->len = (skb->tail -
1451                                                        skb->transport_header);
1452         }
1453
1454         write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits,
1455                          gen, from->wr_hi, from->wr_lo);
1456 }
1457
1458 /**
1459  *      calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1460  *      @skb: the packet
1461  *
1462  *      Returns the number of Tx descriptors needed for the given offload
1463  *      packet.  These packets are already fully constructed.
1464  */
1465 static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb)
1466 {
1467         unsigned int flits, cnt = skb_shinfo(skb)->nr_frags;
1468
1469         if (skb->len <= WR_LEN && cnt == 0)
1470                 return 1;       /* packet fits as immediate data */
1471
1472         flits = skb_transport_offset(skb) / 8;  /* headers */
1473         if (skb->tail != skb->transport_header)
1474                 cnt++;
1475         return flits_to_desc(flits + sgl_len(cnt));
1476 }
1477
1478 /**
1479  *      ofld_xmit - send a packet through an offload queue
1480  *      @adap: the adapter
1481  *      @q: the Tx offload queue
1482  *      @skb: the packet
1483  *
1484  *      Send an offload packet through an SGE offload queue.
1485  */
1486 static int ofld_xmit(struct adapter *adap, struct sge_txq *q,
1487                      struct sk_buff *skb)
1488 {
1489         int ret;
1490         unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen;
1491
1492         spin_lock(&q->lock);
1493       again:reclaim_completed_tx(adap, q);
1494
1495         ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD);
1496         if (unlikely(ret)) {
1497                 if (ret == 1) {
1498                         skb->priority = ndesc;  /* save for restart */
1499                         spin_unlock(&q->lock);
1500                         return NET_XMIT_CN;
1501                 }
1502                 goto again;
1503         }
1504
1505         gen = q->gen;
1506         q->in_use += ndesc;
1507         pidx = q->pidx;
1508         q->pidx += ndesc;
1509         if (q->pidx >= q->size) {
1510                 q->pidx -= q->size;
1511                 q->gen ^= 1;
1512         }
1513         spin_unlock(&q->lock);
1514
1515         write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
1516         check_ring_tx_db(adap, q);
1517         return NET_XMIT_SUCCESS;
1518 }
1519
1520 /**
1521  *      restart_offloadq - restart a suspended offload queue
1522  *      @qs: the queue set cotaining the offload queue
1523  *
1524  *      Resumes transmission on a suspended Tx offload queue.
1525  */
1526 static void restart_offloadq(unsigned long data)
1527 {
1528         struct sk_buff *skb;
1529         struct sge_qset *qs = (struct sge_qset *)data;
1530         struct sge_txq *q = &qs->txq[TXQ_OFLD];
1531         const struct port_info *pi = netdev_priv(qs->netdev);
1532         struct adapter *adap = pi->adapter;
1533
1534         spin_lock(&q->lock);
1535       again:reclaim_completed_tx(adap, q);
1536
1537         while ((skb = skb_peek(&q->sendq)) != NULL) {
1538                 unsigned int gen, pidx;
1539                 unsigned int ndesc = skb->priority;
1540
1541                 if (unlikely(q->size - q->in_use < ndesc)) {
1542                         set_bit(TXQ_OFLD, &qs->txq_stopped);
1543                         smp_mb__after_clear_bit();
1544
1545                         if (should_restart_tx(q) &&
1546                             test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1547                                 goto again;
1548                         q->stops++;
1549                         break;
1550                 }
1551
1552                 gen = q->gen;
1553                 q->in_use += ndesc;
1554                 pidx = q->pidx;
1555                 q->pidx += ndesc;
1556                 if (q->pidx >= q->size) {
1557                         q->pidx -= q->size;
1558                         q->gen ^= 1;
1559                 }
1560                 __skb_unlink(skb, &q->sendq);
1561                 spin_unlock(&q->lock);
1562
1563                 write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
1564                 spin_lock(&q->lock);
1565         }
1566         spin_unlock(&q->lock);
1567
1568 #if USE_GTS
1569         set_bit(TXQ_RUNNING, &q->flags);
1570         set_bit(TXQ_LAST_PKT_DB, &q->flags);
1571 #endif
1572         t3_write_reg(adap, A_SG_KDOORBELL,
1573                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1574 }
1575
1576 /**
1577  *      queue_set - return the queue set a packet should use
1578  *      @skb: the packet
1579  *
1580  *      Maps a packet to the SGE queue set it should use.  The desired queue
1581  *      set is carried in bits 1-3 in the packet's priority.
1582  */
1583 static inline int queue_set(const struct sk_buff *skb)
1584 {
1585         return skb->priority >> 1;
1586 }
1587
1588 /**
1589  *      is_ctrl_pkt - return whether an offload packet is a control packet
1590  *      @skb: the packet
1591  *
1592  *      Determines whether an offload packet should use an OFLD or a CTRL
1593  *      Tx queue.  This is indicated by bit 0 in the packet's priority.
1594  */
1595 static inline int is_ctrl_pkt(const struct sk_buff *skb)
1596 {
1597         return skb->priority & 1;
1598 }
1599
1600 /**
1601  *      t3_offload_tx - send an offload packet
1602  *      @tdev: the offload device to send to
1603  *      @skb: the packet
1604  *
1605  *      Sends an offload packet.  We use the packet priority to select the
1606  *      appropriate Tx queue as follows: bit 0 indicates whether the packet
1607  *      should be sent as regular or control, bits 1-3 select the queue set.
1608  */
1609 int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
1610 {
1611         struct adapter *adap = tdev2adap(tdev);
1612         struct sge_qset *qs = &adap->sge.qs[queue_set(skb)];
1613
1614         if (unlikely(is_ctrl_pkt(skb)))
1615                 return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb);
1616
1617         return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb);
1618 }
1619
1620 /**
1621  *      offload_enqueue - add an offload packet to an SGE offload receive queue
1622  *      @q: the SGE response queue
1623  *      @skb: the packet
1624  *
1625  *      Add a new offload packet to an SGE response queue's offload packet
1626  *      queue.  If the packet is the first on the queue it schedules the RX
1627  *      softirq to process the queue.
1628  */
1629 static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb)
1630 {
1631         skb->next = skb->prev = NULL;
1632         if (q->rx_tail)
1633                 q->rx_tail->next = skb;
1634         else {
1635                 struct sge_qset *qs = rspq_to_qset(q);
1636
1637                 napi_schedule(&qs->napi);
1638                 q->rx_head = skb;
1639         }
1640         q->rx_tail = skb;
1641 }
1642
1643 /**
1644  *      deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
1645  *      @tdev: the offload device that will be receiving the packets
1646  *      @q: the SGE response queue that assembled the bundle
1647  *      @skbs: the partial bundle
1648  *      @n: the number of packets in the bundle
1649  *
1650  *      Delivers a (partial) bundle of Rx offload packets to an offload device.
1651  */
1652 static inline void deliver_partial_bundle(struct t3cdev *tdev,
1653                                           struct sge_rspq *q,
1654                                           struct sk_buff *skbs[], int n)
1655 {
1656         if (n) {
1657                 q->offload_bundles++;
1658                 tdev->recv(tdev, skbs, n);
1659         }
1660 }
1661
1662 /**
1663  *      ofld_poll - NAPI handler for offload packets in interrupt mode
1664  *      @dev: the network device doing the polling
1665  *      @budget: polling budget
1666  *
1667  *      The NAPI handler for offload packets when a response queue is serviced
1668  *      by the hard interrupt handler, i.e., when it's operating in non-polling
1669  *      mode.  Creates small packet batches and sends them through the offload
1670  *      receive handler.  Batches need to be of modest size as we do prefetches
1671  *      on the packets in each.
1672  */
1673 static int ofld_poll(struct napi_struct *napi, int budget)
1674 {
1675         struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
1676         struct sge_rspq *q = &qs->rspq;
1677         struct adapter *adapter = qs->adap;
1678         int work_done = 0;
1679
1680         while (work_done < budget) {
1681                 struct sk_buff *head, *tail, *skbs[RX_BUNDLE_SIZE];
1682                 int ngathered;
1683
1684                 spin_lock_irq(&q->lock);
1685                 head = q->rx_head;
1686                 if (!head) {
1687                         napi_complete(napi);
1688                         spin_unlock_irq(&q->lock);
1689                         return work_done;
1690                 }
1691
1692                 tail = q->rx_tail;
1693                 q->rx_head = q->rx_tail = NULL;
1694                 spin_unlock_irq(&q->lock);
1695
1696                 for (ngathered = 0; work_done < budget && head; work_done++) {
1697                         prefetch(head->data);
1698                         skbs[ngathered] = head;
1699                         head = head->next;
1700                         skbs[ngathered]->next = NULL;
1701                         if (++ngathered == RX_BUNDLE_SIZE) {
1702                                 q->offload_bundles++;
1703                                 adapter->tdev.recv(&adapter->tdev, skbs,
1704                                                    ngathered);
1705                                 ngathered = 0;
1706                         }
1707                 }
1708                 if (head) {     /* splice remaining packets back onto Rx queue */
1709                         spin_lock_irq(&q->lock);
1710                         tail->next = q->rx_head;
1711                         if (!q->rx_head)
1712                                 q->rx_tail = tail;
1713                         q->rx_head = head;
1714                         spin_unlock_irq(&q->lock);
1715                 }
1716                 deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
1717         }
1718
1719         return work_done;
1720 }
1721
1722 /**
1723  *      rx_offload - process a received offload packet
1724  *      @tdev: the offload device receiving the packet
1725  *      @rq: the response queue that received the packet
1726  *      @skb: the packet
1727  *      @rx_gather: a gather list of packets if we are building a bundle
1728  *      @gather_idx: index of the next available slot in the bundle
1729  *
1730  *      Process an ingress offload pakcet and add it to the offload ingress
1731  *      queue.  Returns the index of the next available slot in the bundle.
1732  */
1733 static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1734                              struct sk_buff *skb, struct sk_buff *rx_gather[],
1735                              unsigned int gather_idx)
1736 {
1737         rq->offload_pkts++;
1738         skb_reset_mac_header(skb);
1739         skb_reset_network_header(skb);
1740         skb_reset_transport_header(skb);
1741
1742         if (rq->polling) {
1743                 rx_gather[gather_idx++] = skb;
1744                 if (gather_idx == RX_BUNDLE_SIZE) {
1745                         tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE);
1746                         gather_idx = 0;
1747                         rq->offload_bundles++;
1748                 }
1749         } else
1750                 offload_enqueue(rq, skb);
1751
1752         return gather_idx;
1753 }
1754
1755 /**
1756  *      restart_tx - check whether to restart suspended Tx queues
1757  *      @qs: the queue set to resume
1758  *
1759  *      Restarts suspended Tx queues of an SGE queue set if they have enough
1760  *      free resources to resume operation.
1761  */
1762 static void restart_tx(struct sge_qset *qs)
1763 {
1764         if (test_bit(TXQ_ETH, &qs->txq_stopped) &&
1765             should_restart_tx(&qs->txq[TXQ_ETH]) &&
1766             test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1767                 qs->txq[TXQ_ETH].restarts++;
1768                 if (netif_running(qs->netdev))
1769                         netif_wake_queue(qs->netdev);
1770         }
1771
1772         if (test_bit(TXQ_OFLD, &qs->txq_stopped) &&
1773             should_restart_tx(&qs->txq[TXQ_OFLD]) &&
1774             test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
1775                 qs->txq[TXQ_OFLD].restarts++;
1776                 tasklet_schedule(&qs->txq[TXQ_OFLD].qresume_tsk);
1777         }
1778         if (test_bit(TXQ_CTRL, &qs->txq_stopped) &&
1779             should_restart_tx(&qs->txq[TXQ_CTRL]) &&
1780             test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
1781                 qs->txq[TXQ_CTRL].restarts++;
1782                 tasklet_schedule(&qs->txq[TXQ_CTRL].qresume_tsk);
1783         }
1784 }
1785
1786 /**
1787  *      rx_eth - process an ingress ethernet packet
1788  *      @adap: the adapter
1789  *      @rq: the response queue that received the packet
1790  *      @skb: the packet
1791  *      @pad: amount of padding at the start of the buffer
1792  *
1793  *      Process an ingress ethernet pakcet and deliver it to the stack.
1794  *      The padding is 2 if the packet was delivered in an Rx buffer and 0
1795  *      if it was immediate data in a response.
1796  */
1797 static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
1798                    struct sk_buff *skb, int pad)
1799 {
1800         struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad);
1801         struct port_info *pi;
1802
1803         skb_pull(skb, sizeof(*p) + pad);
1804         skb->protocol = eth_type_trans(skb, adap->port[p->iff]);
1805         skb->dev->last_rx = jiffies;
1806         pi = netdev_priv(skb->dev);
1807         if (pi->rx_csum_offload && p->csum_valid && p->csum == 0xffff &&
1808             !p->fragment) {
1809                 rspq_to_qset(rq)->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
1810                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1811         } else
1812                 skb->ip_summed = CHECKSUM_NONE;
1813
1814         if (unlikely(p->vlan_valid)) {
1815                 struct vlan_group *grp = pi->vlan_grp;
1816
1817                 rspq_to_qset(rq)->port_stats[SGE_PSTAT_VLANEX]++;
1818                 if (likely(grp))
1819                         __vlan_hwaccel_rx(skb, grp, ntohs(p->vlan),
1820                                           rq->polling);
1821                 else
1822                         dev_kfree_skb_any(skb);
1823         } else if (rq->polling)
1824                 netif_receive_skb(skb);
1825         else
1826                 netif_rx(skb);
1827 }
1828
1829 /**
1830  *      handle_rsp_cntrl_info - handles control information in a response
1831  *      @qs: the queue set corresponding to the response
1832  *      @flags: the response control flags
1833  *
1834  *      Handles the control information of an SGE response, such as GTS
1835  *      indications and completion credits for the queue set's Tx queues.
1836  *      HW coalesces credits, we don't do any extra SW coalescing.
1837  */
1838 static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
1839 {
1840         unsigned int credits;
1841
1842 #if USE_GTS
1843         if (flags & F_RSPD_TXQ0_GTS)
1844                 clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
1845 #endif
1846
1847         credits = G_RSPD_TXQ0_CR(flags);
1848         if (credits)
1849                 qs->txq[TXQ_ETH].processed += credits;
1850
1851         credits = G_RSPD_TXQ2_CR(flags);
1852         if (credits)
1853                 qs->txq[TXQ_CTRL].processed += credits;
1854
1855 # if USE_GTS
1856         if (flags & F_RSPD_TXQ1_GTS)
1857                 clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
1858 # endif
1859         credits = G_RSPD_TXQ1_CR(flags);
1860         if (credits)
1861                 qs->txq[TXQ_OFLD].processed += credits;
1862 }
1863
1864 /**
1865  *      check_ring_db - check if we need to ring any doorbells
1866  *      @adapter: the adapter
1867  *      @qs: the queue set whose Tx queues are to be examined
1868  *      @sleeping: indicates which Tx queue sent GTS
1869  *
1870  *      Checks if some of a queue set's Tx queues need to ring their doorbells
1871  *      to resume transmission after idling while they still have unprocessed
1872  *      descriptors.
1873  */
1874 static void check_ring_db(struct adapter *adap, struct sge_qset *qs,
1875                           unsigned int sleeping)
1876 {
1877         if (sleeping & F_RSPD_TXQ0_GTS) {
1878                 struct sge_txq *txq = &qs->txq[TXQ_ETH];
1879
1880                 if (txq->cleaned + txq->in_use != txq->processed &&
1881                     !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
1882                         set_bit(TXQ_RUNNING, &txq->flags);
1883                         t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
1884                                      V_EGRCNTX(txq->cntxt_id));
1885                 }
1886         }
1887
1888         if (sleeping & F_RSPD_TXQ1_GTS) {
1889                 struct sge_txq *txq = &qs->txq[TXQ_OFLD];
1890
1891                 if (txq->cleaned + txq->in_use != txq->processed &&
1892                     !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
1893                         set_bit(TXQ_RUNNING, &txq->flags);
1894                         t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
1895                                      V_EGRCNTX(txq->cntxt_id));
1896                 }
1897         }
1898 }
1899
1900 /**
1901  *      is_new_response - check if a response is newly written
1902  *      @r: the response descriptor
1903  *      @q: the response queue
1904  *
1905  *      Returns true if a response descriptor contains a yet unprocessed
1906  *      response.
1907  */
1908 static inline int is_new_response(const struct rsp_desc *r,
1909                                   const struct sge_rspq *q)
1910 {
1911         return (r->intr_gen & F_RSPD_GEN2) == q->gen;
1912 }
1913
/* Response flag bits carrying the GTS indications of Tx queues 0 and 1. */
#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)
/*
 * All response flag bits carrying Tx control information: the GTS bits above
 * plus the credit fields of Tx queues 0, 1 and 2.  A response with none of
 * these bits set needs no call to handle_rsp_cntrl_info().
 */
#define RSPD_CTRL_MASK (RSPD_GTS_MASK | \
                        V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
                        V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
                        V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR))

/* How long to delay the next interrupt in case of memory shortage, in 0.1us. */
#define NOMEM_INTR_DELAY 2500
1922
/**
 *      process_responses - process responses from an SGE response queue
 *      @adap: the adapter
 *      @qs: the queue set to which the response queue belongs
 *      @budget: how many responses can be processed in this round
 *
 *      Process responses from an SGE response queue up to the supplied budget.
 *      Responses include received packets as well as credits and other events
 *      for the queues that belong to the response queue's queue set.
 *      A negative budget is effectively unlimited.
 *
 *      Additionally choose the interrupt holdoff time for the next interrupt
 *      on this queue.  If the system is under memory shortage use a fairly
 *      long delay to help recovery.
 *
 *      Returns the amount of budget used, i.e. the number of responses
 *      handled (a response abandoned because of an allocation failure is
 *      counted as well).
 */
static int process_responses(struct adapter *adap, struct sge_qset *qs,
                             int budget)
{
        struct sge_rspq *q = &qs->rspq;
        struct rsp_desc *r = &q->desc[q->cidx];
        int budget_left = budget;
        /* accumulates the GTS flags seen across all handled responses */
        unsigned int sleeping = 0;
        /* offload packets gathered for bundled delivery in polling mode */
        struct sk_buff *offload_skbs[RX_BUNDLE_SIZE];
        int ngathered = 0;

        /* default holdoff; overridden below on memory shortage */
        q->next_holdoff = q->holdoff_tmr;

        while (likely(budget_left && is_new_response(r, q))) {
                int eth, ethpad = 2;
                struct sk_buff *skb = NULL;
                u32 len, flags = ntohl(r->flags);
                /* first word of the descriptor and the RSS hash value */
                u32 rss_hi = *(const u32 *)r, rss_lo = r->rss_hdr.rss_hash_val;

                eth = r->rss_hdr.opcode == CPL_RX_PKT;

                if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) {
                        /*
                         * Async notification: copy the first AN_PKT_SIZE
                         * bytes of the descriptor into a fresh skb and tag
                         * it with the CPL_ASYNC_NOTIF opcode.
                         */
                        skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC);
                        if (!skb)
                                goto no_mem;

                        memcpy(__skb_put(skb, AN_PKT_SIZE), r, AN_PKT_SIZE);
                        skb->data[0] = CPL_ASYNC_NOTIF;
                        rss_hi = htonl(CPL_ASYNC_NOTIF << 24);
                        q->async_notif++;
                } else if (flags & F_RSPD_IMM_DATA_VALID) {
                        /* packet data is carried in the response itself */
                        skb = get_imm_packet(r);
                        if (unlikely(!skb)) {
                                /*
                                 * Shared out-of-memory exit, also entered by
                                 * goto from the other allocation sites.  The
                                 * loop is left before the queue index is
                                 * advanced, so this response stays pending.
                                 */
no_mem:
                                q->next_holdoff = NOMEM_INTR_DELAY;
                                q->nomem++;
                                /* consume one credit since we tried */
                                budget_left--;
                                break;
                        }
                        q->imm_data++;
                        ethpad = 0;
                } else if ((len = ntohl(r->len_cq)) != 0) {
                        /* packet data sits in a free-list buffer */
                        struct sge_fl *fl;

                        /* F_RSPD_FLQ selects which free list holds it */
                        fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
                        if (fl->use_pages) {
                                void *addr = fl->sdesc[fl->cidx].pg_chunk.va;

                                prefetch(addr);
#if L1_CACHE_BYTES < 128
                                prefetch(addr + L1_CACHE_BYTES);
#endif
                                __refill_fl(adap, fl);

                                skb = get_packet_pg(adap, fl, G_RSPD_LEN(len),
                                                 eth ? SGE_RX_DROP_THRES : 0);
                        } else
                                skb = get_packet(adap, fl, G_RSPD_LEN(len),
                                                 eth ? SGE_RX_DROP_THRES : 0);
                        if (unlikely(!skb)) {
                                /*
                                 * Ethernet packets are counted as dropped
                                 * and processing continues; anything else
                                 * takes the out-of-memory exit.
                                 */
                                if (!eth)
                                        goto no_mem;
                                q->rx_drops++;
                        } else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT))
                                /* trace packets: pull 2 bytes off the front */
                                __skb_pull(skb, 2);

                        /* advance the free list's consumer index, wrapping */
                        if (++fl->cidx == fl->size)
                                fl->cidx = 0;
                } else
                        /* no data at all: a pure response */
                        q->pure_rsps++;

                if (flags & RSPD_CTRL_MASK) {
                        sleeping |= flags & RSPD_GTS_MASK;
                        handle_rsp_cntrl_info(qs, flags);
                }

                /* step to the next descriptor; flip generation on wrap */
                r++;
                if (unlikely(++q->cidx == q->size)) {
                        q->cidx = 0;
                        q->gen ^= 1;
                        r = q->desc;
                }
                prefetch(r);

                /* pass credits to refill_rspq() once size/4 have accumulated */
                if (++q->credits >= (q->size / 4)) {
                        refill_rspq(adap, q, q->credits);
                        q->credits = 0;
                }

                if (likely(skb != NULL)) {
                        if (eth)
                                rx_eth(adap, q, skb, ethpad);
                        else {
                                /* Preserve the RSS info in csum & priority */
                                skb->csum = rss_hi;
                                skb->priority = rss_lo;
                                ngathered = rx_offload(&adap->tdev, q, skb,
                                                       offload_skbs,
                                                       ngathered);
                        }
                }
                --budget_left;
        }

        /* flush whatever offload packets are still gathered */
        deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
        if (sleeping)
                check_ring_db(adap, qs, sleeping);

        smp_mb();               /* commit Tx queue .processed updates */
        if (unlikely(qs->txq_stopped != 0))
                restart_tx(qs);

        /* budget consumed in this call */
        budget -= budget_left;
        return budget;
}
2053
2054 static inline int is_pure_response(const struct rsp_desc *r)
2055 {
2056         u32 n = ntohl(r->flags) & (F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID);
2057
2058         return (n | r->len_cq) == 0;
2059 }
2060
2061 /**
2062  *      napi_rx_handler - the NAPI handler for Rx processing
2063  *      @napi: the napi instance
2064  *      @budget: how many packets we can process in this round
2065  *
2066  *      Handler for new data events when using NAPI.
2067  */
2068 static int napi_rx_handler(struct napi_struct *napi, int budget)
2069 {
2070         struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
2071         struct adapter *adap = qs->adap;
2072         int work_done = process_responses(adap, qs, budget);
2073
2074         if (likely(work_done < budget)) {
2075                 napi_complete(napi);
2076
2077                 /*
2078                  * Because we don't atomically flush the following
2079                  * write it is possible that in very rare cases it can
2080                  * reach the device in a way that races with a new
2081                  * response being written plus an error interrupt
2082                  * causing the NAPI interrupt handler below to return
2083                  * unhandled status to the OS.  To protect against
2084                  * this would require flushing the write and doing
2085                  * both the write and the flush with interrupts off.
2086                  * Way too expensive and unjustifiable given the
2087                  * rarity of the race.
2088                  *
2089                  * The race cannot happen at all with MSI-X.
2090                  */
2091                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2092                              V_NEWTIMER(qs->rspq.next_holdoff) |
2093                              V_NEWINDEX(qs->rspq.cidx));
2094         }
2095         return work_done;
2096 }
2097
2098 /*
2099  * Returns true if the device is already scheduled for polling.
2100  */
2101 static inline int napi_is_scheduled(struct napi_struct *napi)
2102 {
2103         return test_bit(NAPI_STATE_SCHED, &napi->state);
2104 }
2105
2106 /**
2107  *      process_pure_responses - process pure responses from a response queue
2108  *      @adap: the adapter
2109  *      @qs: the queue set owning the response queue
2110  *      @r: the first pure response to process
2111  *
2112  *      A simpler version of process_responses() that handles only pure (i.e.,
2113  *      non data-carrying) responses.  Such respones are too light-weight to
2114  *      justify calling a softirq under NAPI, so we handle them specially in
2115  *      the interrupt handler.  The function is called with a pointer to a
2116  *      response, which the caller must ensure is a valid pure response.
2117  *
2118  *      Returns 1 if it encounters a valid data-carrying response, 0 otherwise.
2119  */
2120 static int process_pure_responses(struct adapter *adap, struct sge_qset *qs,
2121                                   struct rsp_desc *r)
2122 {
2123         struct sge_rspq *q = &qs->rspq;
2124         unsigned int sleeping = 0;
2125
2126         do {
2127                 u32 flags = ntohl(r->flags);
2128
2129                 r++;
2130                 if (unlikely(++q->cidx == q->size)) {
2131                         q->cidx = 0;
2132                         q->gen ^= 1;
2133                         r = q->desc;
2134                 }
2135                 prefetch(r);
2136
2137                 if (flags & RSPD_CTRL_MASK) {
2138                         sleeping |= flags & RSPD_GTS_MASK;
2139                         handle_rsp_cntrl_info(qs, flags);
2140                 }
2141
2142                 q->pure_rsps++;
2143                 if (++q->credits >= (q->size / 4)) {
2144                         refill_rspq(adap, q, q->credits);
2145                         q->credits = 0;
2146                 }
2147         } while (is_new_response(r, q) && is_pure_response(r));
2148
2149         if (sleeping)
2150                 check_ring_db(adap, qs, sleeping);
2151
2152         smp_mb();               /* commit Tx queue .processed updates */
2153         if (unlikely(qs->txq_stopped != 0))
2154                 restart_tx(qs);
2155
2156         return is_new_response(r, q);
2157 }
2158
2159 /**
2160  *      handle_responses - decide what to do with new responses in NAPI mode
2161  *      @adap: the adapter
2162  *      @q: the response queue
2163  *
2164  *      This is used by the NAPI interrupt handlers to decide what to do with
2165  *      new SGE responses.  If there are no new responses it returns -1.  If
2166  *      there are new responses and they are pure (i.e., non-data carrying)
2167  *      it handles them straight in hard interrupt context as they are very
2168  *      cheap and don't deliver any packets.  Finally, if there are any data
2169  *      signaling responses it schedules the NAPI handler.  Returns 1 if it
2170  *      schedules NAPI, 0 if all new responses were pure.
2171  *
2172  *      The caller must ascertain NAPI is not already running.
2173  */
2174 static inline int handle_responses(struct adapter *adap, struct sge_rspq *q)
2175 {
2176         struct sge_qset *qs = rspq_to_qset(q);
2177         struct rsp_desc *r = &q->desc[q->cidx];
2178
2179         if (!is_new_response(r, q))
2180                 return -1;
2181         if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) {
2182                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2183                              V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
2184                 return 0;
2185         }
2186         napi_schedule(&qs->napi);
2187         return 1;
2188 }
2189
2190 /*
2191  * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case
2192  * (i.e., response queue serviced in hard interrupt).
2193  */
2194 irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
2195 {
2196         struct sge_qset *qs = cookie;
2197         struct adapter *adap = qs->adap;
2198         struct sge_rspq *q = &qs->rspq;
2199
2200         spin_lock(&q->lock);
2201         if (process_responses(adap, qs, -1) == 0)
2202                 q->unhandled_irqs++;
2203         t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2204                      V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2205         spin_unlock(&q->lock);
2206         return IRQ_HANDLED;
2207 }
2208
2209 /*
2210  * The MSI-X interrupt handler for an SGE response queue for the NAPI case
2211  * (i.e., response queue serviced by NAPI polling).
2212  */
2213 irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
2214 {
2215         struct sge_qset *qs = cookie;
2216         struct sge_rspq *q = &qs->rspq;
2217
2218         spin_lock(&q->lock);
2219
2220         if (handle_responses(qs->adap, q) < 0)
2221                 q->unhandled_irqs++;
2222         spin_unlock(&q->lock);
2223         return IRQ_HANDLED;
2224 }
2225
2226 /*
2227  * The non-NAPI MSI interrupt handler.  This needs to handle data events from
2228  * SGE response queues as well as error and other async events as they all use
2229  * the same MSI vector.  We use one SGE response queue per port in this mode
2230  * and protect all response queues with queue 0's lock.
2231  */
2232 static irqreturn_t t3_intr_msi(int irq, void *cookie)
2233 {
2234         int new_packets = 0;
2235         struct adapter *adap = cookie;
2236         struct sge_rspq *q = &adap->sge.qs[0].rspq;
2237
2238         spin_lock(&q->lock);
2239
2240         if (process_responses(adap, &adap->sge.qs[0], -1)) {
2241                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2242                              V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2243                 new_packets = 1;
2244         }
2245
2246         if (adap->params.nports == 2 &&
2247             process_responses(adap, &adap->sge.qs[1], -1)) {
2248                 struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2249
2250                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) |
2251                              V_NEWTIMER(q1->next_holdoff) |
2252                              V_NEWINDEX(q1->cidx));
2253                 new_packets = 1;
2254         }
2255
2256         if (!new_packets && t3_slow_intr_handler(adap) == 0)
2257                 q->unhandled_irqs++;
2258
2259         spin_unlock(&q->lock);
2260         return IRQ_HANDLED;
2261 }
2262
2263 static int rspq_check_napi(struct sge_qset *qs)
2264 {
2265         struct sge_rspq *q = &qs->rspq;
2266
2267         if (!napi_is_scheduled(&qs->napi) &&
2268             is_new_response(&q->desc[q->cidx], q)) {
2269                 napi_schedule(&qs->napi);
2270                 return 1;
2271         }
2272         return 0;
2273 }
2274
2275 /*
2276  * The MSI interrupt handler for the NAPI case (i.e., response queues serviced
2277  * by NAPI polling).  Handles data events from SGE response queues as well as
2278  * error and other async events as they all use the same MSI vector.  We use
2279  * one SGE response queue per port in this mode and protect all response
2280  * queues with queue 0's lock.
2281  */
2282 irqreturn_t t3_intr_msi_napi(int irq, void *cookie)
2283 {
2284         int new_packets;
2285         struct adapter *adap = cookie;
2286         struct sge_rspq *q = &adap->sge.qs[0].rspq;
2287
2288         spin_lock(&q->lock);
2289
2290         new_packets = rspq_check_napi(&adap->sge.qs[0]);
2291         if (adap->params.nports == 2)
2292                 new_packets += rspq_check_napi(&adap->sge.qs[1]);
2293         if (!new_packets && t3_slow_intr_handler(adap) == 0)
2294                 q->unhandled_irqs++;
2295
2296         spin_unlock(&q->lock);
2297         return IRQ_HANDLED;
2298 }
2299
2300 /*
2301  * A helper function that processes responses and issues GTS.
2302  */
2303 static inline int process_responses_gts(struct adapter *adap,
2304                                         struct sge_rspq *rq)
2305 {
2306         int work;
2307
2308         work = process_responses(adap, rspq_to_qset(rq), -1);
2309         t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2310                      V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2311         return work;
2312 }
2313
2314 /*
2315  * The legacy INTx interrupt handler.  This needs to handle data events from
2316  * SGE response queues as well as error and other async events as they all use
2317  * the same interrupt pin.  We use one SGE response queue per port in this mode
2318  * and protect all response queues with queue 0's lock.
2319  */
2320 static irqreturn_t t3_intr(int irq, void *cookie)
2321 {
2322         int work_done, w0, w1;
2323         struct adapter *adap = cookie;
2324         struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2325         struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2326
2327         spin_lock(&q0->lock);
2328
2329         w0 = is_new_response(&q0->desc[q0->cidx], q0);
2330         w1 = adap->params.nports == 2 &&
2331             is_new_response(&q1->desc[q1->cidx], q1);
2332
2333         if (likely(w0 | w1)) {
2334                 t3_write_reg(adap, A_PL_CLI, 0);
2335                 t3_read_reg(adap, A_PL_CLI);    /* flush */
2336
2337                 if (likely(w0))
2338                         process_responses_gts(adap, q0);
2339
2340                 if (w1)
2341                         process_responses_gts(adap, q1);
2342
2343                 work_done = w0 | w1;
2344         } else
2345                 work_done = t3_slow_intr_handler(adap);
2346
2347         spin_unlock(&q0->lock);
2348         return IRQ_RETVAL(work_done != 0);
2349 }
2350
2351 /*
2352  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2353  * Handles data events from SGE response queues as well as error and other
2354  * async events as they all use the same interrupt pin.  We use one SGE
2355  * response queue per port in this mode and protect all response queues with
2356  * queue 0's lock.
2357  */
2358 static irqreturn_t t3b_intr(int irq, void *cookie)
2359 {
2360         u32 map;
2361         struct adapter *adap = cookie;
2362         struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2363
2364         t3_write_reg(adap, A_PL_CLI, 0);
2365         map = t3_read_reg(adap, A_SG_DATA_INTR);
2366
2367         if (unlikely(!map))     /* shared interrupt, most likely */
2368                 return IRQ_NONE;
2369
2370         spin_lock(&q0->lock);
2371
2372         if (unlikely(map & F_ERRINTR))
2373                 t3_slow_intr_handler(adap);
2374
2375         if (likely(map & 1))
2376                 process_responses_gts(adap, q0);
2377
2378         if (map & 2)
2379                 process_responses_gts(adap, &adap->sge.qs[1].rspq);
2380
2381         spin_unlock(&q0->lock);
2382         return IRQ_HANDLED;
2383 }
2384
2385 /*
2386  * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards.
2387  * Handles data events from SGE response queues as well as error and other
2388  * async events as they all use the same interrupt pin.  We use one SGE
2389  * response queue per port in this mode and protect all response queues with
2390  * queue 0's lock.
2391  */
2392 static irqreturn_t t3b_intr_napi(int irq, void *cookie)
2393 {
2394         u32 map;
2395         struct adapter *adap = cookie;
2396         struct sge_qset *qs0 = &adap->sge.qs[0];
2397         struct sge_rspq *q0 = &qs0->rspq;
2398
2399         t3_write_reg(adap, A_PL_CLI, 0);
2400         map = t3_read_reg(adap, A_SG_DATA_INTR);
2401
2402         if (unlikely(!map))     /* shared interrupt, most likely */
2403                 return IRQ_NONE;
2404
2405         spin_lock(&q0->lock);
2406
2407         if (unlikely(map & F_ERRINTR))
2408                 t3_slow_intr_handler(adap);
2409
2410         if (likely(map & 1))
2411                 napi_schedule(&qs0->napi);
2412
2413         if (map & 2)
2414                 napi_schedule(&adap->sge.qs[1].napi);
2415
2416         spin_unlock(&q0->lock);
2417         return IRQ_HANDLED;
2418 }
2419
2420 /**
2421  *      t3_intr_handler - select the top-level interrupt handler
2422  *      @adap: the adapter
2423  *      @polling: whether using NAPI to service response queues
2424  *
2425  *      Selects the top-level interrupt handler based on the type of interrupts
2426  *      (MSI-X, MSI, or legacy) and whether NAPI will be used to service the
2427  *      response queues.
2428  */
2429 intr_handler_t t3_intr_handler(struct adapter *adap, int polling)
2430 {
2431         if (adap->flags & USING_MSIX)
2432                 return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix;
2433         if (adap->flags & USING_MSI)
2434                 return polling ? t3_intr_msi_napi : t3_intr_msi;
2435         if (adap->params.rev > 0)
2436                 return polling ? t3b_intr_napi : t3b_intr;
2437         return t3_intr;
2438 }
2439
2440 /**
2441  *      t3_sge_err_intr_handler - SGE async event interrupt handler
2442  *      @adapter: the adapter
2443  *
2444  *      Interrupt handler for SGE asynchronous (non-data) events.
2445  */
2446 void t3_sge_err_intr_handler(struct adapter *adapter)
2447 {
2448         unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE);
2449
2450         if (status & F_RSPQCREDITOVERFOW)
2451                 CH_ALERT(adapter, "SGE response queue credit overflow\n");
2452
2453         if (status & F_RSPQDISABLED) {
2454                 v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
2455
2456                 CH_ALERT(adapter,
2457                          "packet delivered to disabled response queue "
2458                          "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff);
2459         }
2460
2461         t3_write_reg(adapter, A_SG_INT_CAUSE, status);
2462         if (status & (F_RSPQCREDITOVERFOW | F_RSPQDISABLED))
2463                 t3_fatal_err(adapter);
2464 }
2465
2466 /**
2467  *      sge_timer_cb - perform periodic maintenance of an SGE qset
2468  *      @data: the SGE queue set to maintain
2469  *
2470  *      Runs periodically from a timer to perform maintenance of an SGE queue
2471  *      set.  It performs two tasks:
2472  *
2473  *      a) Cleans up any completed Tx descriptors that may still be pending.
2474  *      Normal descriptor cleanup happens when new packets are added to a Tx
2475  *      queue so this timer is relatively infrequent and does any cleanup only
2476  *      if the Tx queue has not seen any new packets in a while.  We make a
2477  *      best effort attempt to reclaim descriptors, in that we don't wait
2478  *      around if we cannot get a queue's lock (which most likely is because
2479  *      someone else is queueing new packets and so will also handle the clean
2480  *      up).  Since control queues use immediate data exclusively we don't
2481  *      bother cleaning them up here.
2482  *
2483  *      b) Replenishes Rx queues that have run out due to memory shortage.
2484  *      Normally new Rx buffers are added when existing ones are consumed but
2485  *      when out of memory a queue can become empty.  We try to add only a few
2486  *      buffers here, the queue will be replenished fully as these new buffers
2487  *      are used up if memory shortage has subsided.
2488  */
2489 static void sge_timer_cb(unsigned long data)
2490 {
2491         spinlock_t *lock;
2492         struct sge_qset *qs = (struct sge_qset *)data;
2493         struct adapter *adap = qs->adap;
2494
2495         if (spin_trylock(&qs->txq[TXQ_ETH].lock)) {
2496                 reclaim_completed_tx(adap, &qs->txq[TXQ_ETH]);
2497                 spin_unlock(&qs->txq[TXQ_ETH].lock);
2498         }
2499         if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) {
2500                 reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD]);
2501                 spin_unlock(&qs->txq[TXQ_OFLD].lock);
2502         }
2503         lock = (adap->flags & USING_MSIX) ? &qs->rspq.lock :
2504                                             &adap->sge.qs[0].rspq.lock;
2505         if (spin_trylock_irq(lock)) {
2506                 if (!napi_is_scheduled(&qs->napi)) {
2507                         u32 status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
2508
2509                         if (qs->fl[0].credits < qs->fl[0].size)
2510                                 __refill_fl(adap, &qs->fl[0]);
2511                         if (qs->fl[1].credits < qs->fl[1].size)
2512                                 __refill_fl(adap, &qs->fl[1]);
2513
2514                         if (status & (1 << qs->rspq.cntxt_id)) {
2515                                 qs->rspq.starved++;
2516                                 if (qs->rspq.credits) {
2517                                         refill_rspq(adap, &qs->rspq, 1);
2518                                         qs->rspq.credits--;
2519                                         qs->rspq.restarted++;
2520                                         t3_write_reg(adap, A_SG_RSPQ_FL_STATUS,
2521                                                      1 << qs->rspq.cntxt_id);
2522                                 }
2523                         }
2524                 }
2525                 spin_unlock_irq(lock);
2526         }
2527         mod_timer(&qs->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
2528 }
2529
2530 /**
2531  *      t3_update_qset_coalesce - update coalescing settings for a queue set
2532  *      @qs: the SGE queue set
2533  *      @p: new queue set parameters
2534  *
2535  *      Update the coalescing settings for an SGE queue set.  Nothing is done
2536  *      if the queue set is not initialized yet.
2537  */
2538 void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
2539 {
2540         qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */
2541         qs->rspq.polling = p->polling;
2542         qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
2543 }
2544
2545 /**
2546  *      t3_sge_alloc_qset - initialize an SGE queue set
2547  *      @adapter: the adapter
2548  *      @id: the queue set id
2549  *      @nports: how many Ethernet ports will be using this queue set
2550  *      @irq_vec_idx: the IRQ vector index for response queue interrupts
2551  *      @p: configuration parameters for this queue set
2552  *      @ntxq: number of Tx queues for the queue set
2553  *      @netdev: net device associated with this queue set
2554  *
2555  *      Allocate resources and initialize an SGE queue set.  A queue set
2556  *      comprises a response queue, two Rx free-buffer queues, and up to 3
2557  *      Tx queues.  The Tx queues are assigned roles in the order Ethernet
2558  *      queue, offload queue, and control queue.
2559  */
2560 int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
2561                       int irq_vec_idx, const struct qset_params *p,
2562                       int ntxq, struct net_device *dev)
2563 {
2564         int i, ret = -ENOMEM;
2565         struct sge_qset *q = &adapter->sge.qs[id];
2566
2567         init_qset_cntxt(q, id);
2568         init_timer(&q->tx_reclaim_timer);
2569         q->tx_reclaim_timer.data = (unsigned long)q;
2570         q->tx_reclaim_timer.function = sge_timer_cb;
2571
2572         q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size,
2573                                    sizeof(struct rx_desc),
2574                                    sizeof(struct rx_sw_desc),
2575                                    &q->fl[0].phys_addr, &q->fl[0].sdesc);
2576         if (!q->fl[0].desc)
2577                 goto err;
2578
2579         q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size,
2580                                    sizeof(struct rx_desc),
2581                                    sizeof(struct rx_sw_desc),
2582                                    &q->fl[1].phys_addr, &q->fl[1].sdesc);
2583         if (!q->fl[1].desc)
2584                 goto err;
2585
2586         q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size,
2587                                   sizeof(struct rsp_desc), 0,
2588                                   &q->rspq.phys_addr, NULL);
2589         if (!q->rspq.desc)
2590                 goto err;
2591
2592         for (i = 0; i < ntxq; ++i) {
2593                 /*
2594                  * The control queue always uses immediate data so does not
2595                  * need to keep track of any sk_buffs.
2596                  */
2597                 size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2598
2599                 q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i],
2600                                             sizeof(struct tx_desc), sz,
2601                                             &q->txq[i].phys_addr,
2602                                             &q->txq[i].sdesc);
2603                 if (!q->txq[i].desc)
2604                         goto err;
2605
2606                 q->txq[i].gen = 1;
2607                 q->txq[i].size = p->txq_size[i];
2608                 spin_lock_init(&q->txq[i].lock);
2609                 skb_queue_head_init(&q->txq[i].sendq);
2610         }
2611
2612         tasklet_init(&q->txq[TXQ_OFLD].qresume_tsk, restart_offloadq,
2613                      (unsigned long)q);
2614         tasklet_init(&q->txq[TXQ_CTRL].qresume_tsk, restart_ctrlq,
2615                      (unsigned long)q);
2616
2617         q->fl[0].gen = q->fl[1].gen = 1;
2618         q->fl[0].size = p->fl_size;
2619         q->fl[1].size = p->jumbo_size;
2620
2621         q->rspq.gen = 1;
2622         q->rspq.size = p->rspq_size;
2623         spin_lock_init(&q->rspq.lock);
2624
2625         q->txq[TXQ_ETH].stop_thres = nports *
2626             flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3);
2627
2628 #if FL0_PG_CHUNK_SIZE > 0
2629         q->fl[0].buf_size = FL0_PG_CHUNK_SIZE;
2630 #else
2631         q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data);
2632 #endif
2633         q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0;
2634         q->fl[1].buf_size = is_offload(adapter) ?
2635                 (16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
2636                 MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt);
2637
2638         spin_lock(&adapter->sge.reg_lock);
2639
2640         /* FL threshold comparison uses < */
2641         ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx,
2642                                    q->rspq.phys_addr, q->rspq.size,
2643                                    q->fl[0].buf_size, 1, 0);
2644         if (ret)
2645                 goto err_unlock;
2646
2647         for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
2648                 ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0,
2649                                           q->fl[i].phys_addr, q->fl[i].size,
2650                                           q->fl[i].buf_size, p->cong_thres, 1,
2651                                           0);
2652                 if (ret)
2653                         goto err_unlock;
2654         }
2655
2656         ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
2657                                  SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
2658                                  q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
2659                                  1, 0);
2660         if (ret)
2661                 goto err_unlock;
2662
2663         if (ntxq > 1) {
2664                 ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id,
2665                                          USE_GTS, SGE_CNTXT_OFLD, id,
2666                                          q->txq[TXQ_OFLD].phys_addr,
2667                                          q->txq[TXQ_OFLD].size, 0, 1, 0);
2668                 if (ret)
2669                         goto err_unlock;
2670         }
2671
2672         if (ntxq > 2) {
2673                 ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0,
2674                                          SGE_CNTXT_CTRL, id,
2675                                          q->txq[TXQ_CTRL].phys_addr,
2676                                          q->txq[TXQ_CTRL].size,
2677                                          q->txq[TXQ_CTRL].token, 1, 0);
2678                 if (ret)
2679                         goto err_unlock;
2680         }
2681
2682         spin_unlock(&adapter->sge.reg_lock);
2683
2684         q->adap = adapter;
2685         q->netdev = dev;
2686         t3_update_qset_coalesce(q, p);
2687
2688         refill_fl(adapter, &q->fl[0], q->fl[0].size, GFP_KERNEL);
2689         refill_fl(adapter, &q->fl[1], q->fl[1].size, GFP_KERNEL);
2690         refill_rspq(adapter, &q->rspq, q->rspq.size - 1);
2691
2692         t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
2693                      V_NEWTIMER(q->rspq.holdoff_tmr));
2694
2695         mod_timer(&q->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
2696         return 0;
2697
2698       err_unlock:
2699         spin_unlock(&adapter->sge.reg_lock);
2700       err:
2701         t3_free_qset(adapter, q);
2702         return ret;
2703 }
2704
2705 /**
2706  *      t3_free_sge_resources - free SGE resources
2707  *      @adap: the adapter
2708  *
2709  *      Frees resources used by the SGE queue sets.
2710  */
2711 void t3_free_sge_resources(struct adapter *adap)
2712 {
2713         int i;
2714
2715         for (i = 0; i < SGE_QSETS; ++i)
2716                 t3_free_qset(adap, &adap->sge.qs[i]);
2717 }
2718
2719 /**
2720  *      t3_sge_start - enable SGE
2721  *      @adap: the adapter
2722  *
2723  *      Enables the SGE for DMAs.  This is the last step in starting packet
2724  *      transfers.
2725  */
2726 void t3_sge_start(struct adapter *adap)
2727 {
2728         t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
2729 }
2730
2731 /**
2732  *      t3_sge_stop - disable SGE operation
2733  *      @adap: the adapter
2734  *
2735  *      Disables the DMA engine.  This can be called in emeregencies (e.g.,
2736  *      from error interrupts) or from normal process context.  In the latter
2737  *      case it also disables any pending queue restart tasklets.  Note that
2738  *      if it is called in interrupt context it cannot disable the restart
2739  *      tasklets as it cannot wait, however the tasklets will have no effect
2740  *      since the doorbells are disabled and the driver will call this again
2741  *      later from process context, at which time the tasklets will be stopped
2742  *      if they are still running.
2743  */
2744 void t3_sge_stop(struct adapter *adap)
2745 {
2746         t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0);
2747         if (!in_interrupt()) {
2748                 int i;
2749
2750                 for (i = 0; i < SGE_QSETS; ++i) {
2751                         struct sge_qset *qs = &adap->sge.qs[i];
2752
2753                         tasklet_kill(&qs->txq[TXQ_OFLD].qresume_tsk);
2754                         tasklet_kill(&qs->txq[TXQ_CTRL].qresume_tsk);
2755                 }
2756         }
2757 }
2758
2759 /**
2760  *      t3_sge_init - initialize SGE
2761  *      @adap: the adapter
2762  *      @p: the SGE parameters
2763  *
2764  *      Performs SGE initialization needed every time after a chip reset.
2765  *      We do not initialize any of the queue sets here, instead the driver
2766  *      top-level must request those individually.  We also do not enable DMA
2767  *      here, that should be done after the queues have been set up.
2768  */
2769 void t3_sge_init(struct adapter *adap, struct sge_params *p)
2770 {
2771         unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12);
2772
2773         ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
2774             F_CQCRDTCTRL |
2775             V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
2776             V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
2777 #if SGE_NUM_GENBITS == 1
2778         ctrl |= F_EGRGENCTRL;
2779 #endif
2780         if (adap->params.rev > 0) {
2781                 if (!(adap->flags & (USING_MSIX | USING_MSI)))
2782                         ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
2783                 ctrl |= F_CQCRDTCTRL | F_AVOIDCQOVFL;
2784         }
2785         t3_write_reg(adap, A_SG_CONTROL, ctrl);
2786         t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
2787                      V_LORCQDRBTHRSH(512));
2788         t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
2789         t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
2790                      V_TIMEOUT(200 * core_ticks_per_usec(adap)));
2791         t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH, 1000);
2792         t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
2793         t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
2794         t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
2795         t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
2796         t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
2797 }
2798
2799 /**
2800  *      t3_sge_prep - one-time SGE initialization
2801  *      @adap: the associated adapter
2802  *      @p: SGE parameters
2803  *
2804  *      Performs one-time initialization of SGE SW state.  Includes determining
2805  *      defaults for the assorted SGE parameters, which admins can change until
2806  *      they are used to initialize the SGE.
2807  */
2808 void __devinit t3_sge_prep(struct adapter *adap, struct sge_params *p)
2809 {
2810         int i;
2811
2812         p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) -
2813             SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
2814
2815         for (i = 0; i < SGE_QSETS; ++i) {
2816                 struct qset_params *q = p->qset + i;
2817
2818                 q->polling = adap->params.rev > 0;
2819                 q->coalesce_usecs = 5;
2820                 q->rspq_size = 1024;
2821                 q->fl_size = 1024;
2822                 q->jumbo_size = 512;
2823                 q->txq_size[TXQ_ETH] = 1024;
2824                 q->txq_size[TXQ_OFLD] = 1024;
2825                 q->txq_size[TXQ_CTRL] = 256;
2826                 q->cong_thres = 0;
2827         }
2828
2829         spin_lock_init(&adap->sge.reg_lock);
2830 }
2831
2832 /**
2833  *      t3_get_desc - dump an SGE descriptor for debugging purposes
2834  *      @qs: the queue set
2835  *      @qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
2836  *      @idx: the descriptor index in the queue
2837  *      @data: where to dump the descriptor contents
2838  *
2839  *      Dumps the contents of a HW descriptor of an SGE queue.  Returns the
2840  *      size of the descriptor.
2841  */
2842 int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
2843                 unsigned char *data)
2844 {
2845         if (qnum >= 6)
2846                 return -EINVAL;
2847
2848         if (qnum < 3) {
2849                 if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
2850                         return -EINVAL;
2851                 memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
2852                 return sizeof(struct tx_desc);
2853         }
2854
2855         if (qnum == 3) {
2856                 if (!qs->rspq.desc || idx >= qs->rspq.size)
2857                         return -EINVAL;
2858                 memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
2859                 return sizeof(struct rsp_desc);
2860         }
2861
2862         qnum -= 4;
2863         if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
2864                 return -EINVAL;
2865         memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
2866         return sizeof(struct rx_desc);
2867 }