FreeBSD/Linux Kernel Cross Reference
sys/dev/cxgbe/tom/t4_tom.c

/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_kern_tls.h"
#include "opt_ratelimit.h"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/limits.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/refcount.h>
#include <sys/rmlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/scope6_var.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>

#ifdef TCP_OFFLOAD
#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_tcb.h"
#include "t4_clip.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"
#include "tom/t4_tls.h"

static struct protosw toe_protosw;
static struct protosw toe6_protosw;

/* Module ops */
static int t4_tom_mod_load(void);
static int t4_tom_mod_unload(void);
static int t4_tom_modevent(module_t, int, void *);

/* ULD ops and helpers */
static int t4_tom_activate(struct adapter *);
static int t4_tom_deactivate(struct adapter *);

static struct uld_info tom_uld_info = {
        .uld_id = ULD_TOM,
        .activate = t4_tom_activate,
        .deactivate = t4_tom_deactivate,
};

static void release_offload_resources(struct toepcb *);
static int alloc_tid_tabs(struct tid_info *);
static void free_tid_tabs(struct tid_info *);
static void free_tom_data(struct adapter *, struct tom_data *);
static void reclaim_wr_resources(void *, int);

struct toepcb *
alloc_toepcb(struct vi_info *vi, int flags)
{
        struct port_info *pi = vi->pi;
        struct adapter *sc = pi->adapter;
        struct toepcb *toep;
        int tx_credits, txsd_total, len;

        /*
         * The firmware counts tx work request credits in units of 16 bytes
         * each.  Reserve room for an ABORT_REQ so the driver never has to worry
         * about tx credits if it wants to abort a connection.
         */
        tx_credits = sc->params.ofldq_wr_cred;
        tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
        /*
         * The shortest possible tx work request is a fw_ofld_tx_data_wr plus
         * 1 byte of immediate payload, and the firmware counts tx work
         * request credits in units of 16 bytes.  Calculate the maximum
         * number of work requests possible.
         */
        txsd_total = tx_credits /
            howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
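
        /*
         * Worked example of the credit math above (illustrative only; the
         * struct sizes here are assumptions, not values taken from the
         * headers): if sizeof(struct cpl_abort_req) were 32 bytes, the
         * reservation would be howmany(32, 16) = 2 credits.  If
         * sizeof(struct fw_ofld_tx_data_wr) were 24 bytes, the smallest tx
         * work request would cost howmany(24 + 1, 16) = 2 credits, so an
         * adapter that grants 512 credits per offload queue would end up
         * with txsd_total = (512 - 2) / 2 = 255 tx descriptors.
         */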

        len = offsetof(struct toepcb, txsd) +
            txsd_total * sizeof(struct ofld_tx_sdesc);

        toep = malloc(len, M_CXGBE, M_ZERO | flags);
        if (toep == NULL)
                return (NULL);

        refcount_init(&toep->refcount, 1);
        toep->td = sc->tom_softc;
        toep->vi = vi;
        toep->tid = -1;
        toep->tx_total = tx_credits;
        toep->tx_credits = tx_credits;
        mbufq_init(&toep->ulp_pduq, INT_MAX);
        mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
        toep->txsd_total = txsd_total;
        toep->txsd_avail = txsd_total;
        toep->txsd_pidx = 0;
        toep->txsd_cidx = 0;
        aiotx_init_toep(toep);

        return (toep);
}

/*
 * Initialize a toepcb after its params have been filled out.
 */
int
init_toepcb(struct vi_info *vi, struct toepcb *toep)
{
        struct conn_params *cp = &toep->params;
        struct port_info *pi = vi->pi;
        struct adapter *sc = pi->adapter;
        struct tx_cl_rl_params *tc;

        if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) {
                tc = &pi->sched_params->cl_rl[cp->tc_idx];
                mtx_lock(&sc->tc_lock);
                if (tc->state != CS_HW_CONFIGURED) {
                        CH_ERR(vi, "tid %d cannot be bound to traffic class %d "
                            "because it is not configured (its state is %d)\n",
                            toep->tid, cp->tc_idx, tc->state);
                        cp->tc_idx = -1;
                } else {
                        tc->refcount++;
                }
                mtx_unlock(&sc->tc_lock);
        }
        toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
        toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx];
        toep->ctrlq = &sc->sge.ctrlq[pi->port_id];

        tls_init_toep(toep);
        if (ulp_mode(toep) == ULP_MODE_TCPDDP)
                ddp_init_toep(toep);

        toep->flags |= TPF_INITIALIZED;

        return (0);
}

struct toepcb *
hold_toepcb(struct toepcb *toep)
{

        refcount_acquire(&toep->refcount);
        return (toep);
}

void
free_toepcb(struct toepcb *toep)
{

        if (refcount_release(&toep->refcount) == 0)
                return;

        KASSERT(!(toep->flags & TPF_ATTACHED),
            ("%s: attached to an inpcb", __func__));
        KASSERT(!(toep->flags & TPF_CPL_PENDING),
            ("%s: CPL pending", __func__));

        if (toep->flags & TPF_INITIALIZED) {
                if (ulp_mode(toep) == ULP_MODE_TCPDDP)
                        ddp_uninit_toep(toep);
                tls_uninit_toep(toep);
        }
        free(toep, M_CXGBE);
}
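
/*
 * Illustrative usage of the refcount pair above (a sketch, not a call site
 * from this file): code that stashes a toepcb pointer somewhere should take
 * its own reference and drop it when done, e.g.:
 *
 *      struct toepcb *ref = hold_toepcb(toep);
 *      ...
 *      free_toepcb(ref);       // memory is freed only on the last release
 */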

/*
 * Set up the socket for TCP offload.
 */
void
offload_socket(struct socket *so, struct toepcb *toep)
{
        struct tom_data *td = toep->td;
        struct inpcb *inp = sotoinpcb(so);
        struct tcpcb *tp = intotcpcb(inp);
        struct sockbuf *sb;

        INP_WLOCK_ASSERT(inp);

        /* Update socket */
        sb = &so->so_snd;
        SOCKBUF_LOCK(sb);
        sb->sb_flags |= SB_NOCOALESCE;
        SOCKBUF_UNLOCK(sb);
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);
        sb->sb_flags |= SB_NOCOALESCE;
        if (inp->inp_vflag & INP_IPV6)
                so->so_proto = &toe6_protosw;
        else
                so->so_proto = &toe_protosw;
        SOCKBUF_UNLOCK(sb);

        /* Update TCP PCB */
        tp->tod = &td->tod;
        tp->t_toe = toep;
        tp->t_flags |= TF_TOE;

        /* Install an extra hold on inp */
        toep->inp = inp;
        toep->flags |= TPF_ATTACHED;
        in_pcbref(inp);

        /* Add the TOE PCB to the active list */
        mtx_lock(&td->toep_list_lock);
        TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
        mtx_unlock(&td->toep_list_lock);
}

void
restore_so_proto(struct socket *so, bool v6)
{
        if (v6)
                so->so_proto = &tcp6_protosw;
        else
                so->so_proto = &tcp_protosw;
}

/* This is _not_ the normal way to "unoffload" a socket. */
void
undo_offload_socket(struct socket *so)
{
        struct inpcb *inp = sotoinpcb(so);
        struct tcpcb *tp = intotcpcb(inp);
        struct toepcb *toep = tp->t_toe;
        struct tom_data *td = toep->td;
        struct sockbuf *sb;

        INP_WLOCK_ASSERT(inp);

        sb = &so->so_snd;
        SOCKBUF_LOCK(sb);
        sb->sb_flags &= ~SB_NOCOALESCE;
        SOCKBUF_UNLOCK(sb);
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);
        sb->sb_flags &= ~SB_NOCOALESCE;
        restore_so_proto(so, inp->inp_vflag & INP_IPV6);
        SOCKBUF_UNLOCK(sb);

        tp->tod = NULL;
        tp->t_toe = NULL;
        tp->t_flags &= ~TF_TOE;

        toep->inp = NULL;
        toep->flags &= ~TPF_ATTACHED;
        if (in_pcbrele_wlocked(inp))
                panic("%s: inp freed.", __func__);

        mtx_lock(&td->toep_list_lock);
        TAILQ_REMOVE(&td->toep_list, toep, link);
        mtx_unlock(&td->toep_list_lock);
}

static void
release_offload_resources(struct toepcb *toep)
{
        struct tom_data *td = toep->td;
        struct adapter *sc = td_adapter(td);
        int tid = toep->tid;

        KASSERT(!(toep->flags & TPF_CPL_PENDING),
            ("%s: %p has CPL pending.", __func__, toep));
        KASSERT(!(toep->flags & TPF_ATTACHED),
            ("%s: %p is still attached.", __func__, toep));

        CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
            __func__, toep, tid, toep->l2te, toep->ce);

        /*
         * These queues should have been emptied at approximately the same time
         * that a normal connection's socket's so_snd would have been purged or
         * drained.  Do _not_ clean up here.
         */
        MPASS(mbufq_len(&toep->ulp_pduq) == 0);
        MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
#ifdef INVARIANTS
        if (ulp_mode(toep) == ULP_MODE_TCPDDP)
                ddp_assert_empty(toep);
#endif
        MPASS(TAILQ_EMPTY(&toep->aiotx_jobq));

        if (toep->l2te)
                t4_l2t_release(toep->l2te);

        if (tid >= 0) {
                remove_tid(sc, tid, toep->ce ? 2 : 1);
                release_tid(sc, tid, toep->ctrlq);
        }

        if (toep->ce)
                t4_release_clip_entry(sc, toep->ce);

        if (toep->params.tc_idx != -1)
                t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx);

        mtx_lock(&td->toep_list_lock);
        TAILQ_REMOVE(&td->toep_list, toep, link);
        mtx_unlock(&td->toep_list_lock);

        free_toepcb(toep);
}

/*
 * The kernel is done with the TCP PCB and this is our opportunity to unhook the
 * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
 * pending CPL) then it is time to release all resources tied to the toepcb.
 *
 * Also gets called when an offloaded active open fails and the TOM wants the
 * kernel to take the TCP PCB back.
 */
static void
t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
{
#if defined(KTR) || defined(INVARIANTS)
        struct inpcb *inp = tptoinpcb(tp);
#endif
        struct toepcb *toep = tp->t_toe;

        INP_WLOCK_ASSERT(inp);

        KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
        KASSERT(toep->flags & TPF_ATTACHED,
            ("%s: not attached", __func__));

#ifdef KTR
        if (tp->t_state == TCPS_SYN_SENT) {
                CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
                    __func__, toep->tid, toep, toep->flags, inp,
                    inp->inp_flags);
        } else {
                CTR6(KTR_CXGBE,
                    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
                    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
                    inp->inp_flags);
        }
#endif

        tp->tod = NULL;
        tp->t_toe = NULL;
        tp->t_flags &= ~TF_TOE;
        toep->flags &= ~TPF_ATTACHED;

        if (!(toep->flags & TPF_CPL_PENDING))
                release_offload_resources(toep);
}

/*
 * setsockopt handler.
 */
static void
t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
{
        struct adapter *sc = tod->tod_softc;
        struct toepcb *toep = tp->t_toe;

        if (dir == SOPT_GET)
                return;

        CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);

        switch (name) {
        case TCP_NODELAY:
                if (tp->t_state != TCPS_ESTABLISHED)
                        break;
                toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
                t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
                    V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0);
                break;
        default:
                break;
        }
}

static inline uint64_t
get_tcb_tflags(const uint64_t *tcb)
{

        return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32));
}

static inline uint32_t
get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift)
{
#define LAST_WORD ((TCB_SIZE / 4) - 1)
        uint64_t t1, t2;
        int flit_idx;

        MPASS(mask != 0);
        MPASS(word <= LAST_WORD);
        MPASS(shift < 32);

        flit_idx = (LAST_WORD - word) / 2;
        if (word & 0x1)
                shift += 32;
        t1 = be64toh(tcb[flit_idx]) >> shift;
        t2 = 0;
        if (fls(mask) > 64 - shift) {
                /*
                 * Will spill over into the next logical flit, which is the flit
                 * before this one.  The flit_idx before this one must be valid.
                 */
                MPASS(flit_idx > 0);
                t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift);
        }
        return ((t2 | t1) & mask);
#undef LAST_WORD
}
#define GET_TCB_FIELD(tcb, F) \
    get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F)
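
/*
 * Worked example of the flit math in get_tcb_field(), assuming for
 * illustration that TCB_SIZE is 128 bytes: LAST_WORD = 128 / 4 - 1 = 31, so
 * 32-bit word 0 lives in 64-bit flit (31 - 0) / 2 = 15 in its low half,
 * while odd-numbered word 1 shares flit 15 with the shift bumped by 32.  A
 * field whose mask reaches past bit 63 of its flit spills into flit_idx - 1,
 * which is the t2 term above.  GET_TCB_FIELD(tcb, T_STATE) simply expands to
 * get_tcb_field(tcb, W_TCB_T_STATE, M_TCB_T_STATE, S_TCB_T_STATE).
 */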

/*
 * Issues a CPL_GET_TCB to read the entire TCB for the tid.
 */
static int
send_get_tcb(struct adapter *sc, u_int tid)
{
        struct cpl_get_tcb *cpl;
        struct wrq_cookie cookie;

        MPASS(tid >= sc->tids.tid_base);
        MPASS(tid - sc->tids.tid_base < sc->tids.ntids);

        cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16),
            &cookie);
        if (__predict_false(cpl == NULL))
                return (ENOMEM);
        bzero(cpl, sizeof(*cpl));
        INIT_TP_WR(cpl, tid);
        OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid));
        cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) |
            V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id));
        cpl->cookie = 0xff;
        commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie);

        return (0);
}

static struct tcb_histent *
alloc_tcb_histent(struct adapter *sc, u_int tid, int flags)
{
        struct tcb_histent *te;

        MPASS(flags == M_NOWAIT || flags == M_WAITOK);

        te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags);
        if (te == NULL)
                return (NULL);
        mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF);
        callout_init_mtx(&te->te_callout, &te->te_lock, 0);
        te->te_adapter = sc;
        te->te_tid = tid;

        return (te);
}

static void
free_tcb_histent(struct tcb_histent *te)
{

        mtx_destroy(&te->te_lock);
        free(te, M_CXGBE);
}

/*
 * Start tracking the tid in the TCB history.
 */
int
add_tid_to_history(struct adapter *sc, u_int tid)
{
        struct tcb_histent *te = NULL;
        struct tom_data *td = sc->tom_softc;
        int rc;

        MPASS(tid >= sc->tids.tid_base);
        MPASS(tid - sc->tids.tid_base < sc->tids.ntids);

        if (td->tcb_history == NULL)
                return (ENXIO);

        rw_wlock(&td->tcb_history_lock);
        if (td->tcb_history[tid] != NULL) {
                rc = EEXIST;
                goto done;
        }
        te = alloc_tcb_histent(sc, tid, M_NOWAIT);
        if (te == NULL) {
                rc = ENOMEM;
                goto done;
        }
        mtx_lock(&te->te_lock);
        rc = send_get_tcb(sc, tid);
        if (rc == 0) {
                te->te_flags |= TE_RPL_PENDING;
                td->tcb_history[tid] = te;
        }
        mtx_unlock(&te->te_lock);
        if (rc != 0)
                free_tcb_histent(te);   /* unlock first; this destroys te_lock */
done:
        rw_wunlock(&td->tcb_history_lock);
        return (rc);
}

static void
remove_tcb_histent(struct tcb_histent *te)
{
        struct adapter *sc = te->te_adapter;
        struct tom_data *td = sc->tom_softc;

        rw_assert(&td->tcb_history_lock, RA_WLOCKED);
        mtx_assert(&te->te_lock, MA_OWNED);
        MPASS(td->tcb_history[te->te_tid] == te);

        td->tcb_history[te->te_tid] = NULL;
        free_tcb_histent(te);
        rw_wunlock(&td->tcb_history_lock);
}

static inline struct tcb_histent *
lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem)
{
        struct tcb_histent *te;
        struct tom_data *td = sc->tom_softc;

        MPASS(tid >= sc->tids.tid_base);
        MPASS(tid - sc->tids.tid_base < sc->tids.ntids);

        if (td->tcb_history == NULL)
                return (NULL);

        if (addrem)
                rw_wlock(&td->tcb_history_lock);
        else
                rw_rlock(&td->tcb_history_lock);
        te = td->tcb_history[tid];
        if (te != NULL) {
                mtx_lock(&te->te_lock);
                return (te);    /* with both locks held */
        }
        if (addrem)
                rw_wunlock(&td->tcb_history_lock);
        else
                rw_runlock(&td->tcb_history_lock);

        return (te);
}

static inline void
release_tcb_histent(struct tcb_histent *te)
{
        struct adapter *sc = te->te_adapter;
        struct tom_data *td = sc->tom_softc;

        mtx_assert(&te->te_lock, MA_OWNED);
        mtx_unlock(&te->te_lock);
        rw_assert(&td->tcb_history_lock, RA_RLOCKED);
        rw_runlock(&td->tcb_history_lock);
}

static void
request_tcb(void *arg)
{
        struct tcb_histent *te = arg;

        mtx_assert(&te->te_lock, MA_OWNED);

        /* No one else is supposed to update the histent. */
        MPASS(!(te->te_flags & TE_RPL_PENDING));
        if (send_get_tcb(te->te_adapter, te->te_tid) == 0)
                te->te_flags |= TE_RPL_PENDING;
        else
                callout_schedule(&te->te_callout, hz / 100);
}

static void
update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb)
{
        struct tom_data *td = te->te_adapter->tom_softc;
        uint64_t tflags = get_tcb_tflags(tcb);
        uint8_t sample = 0;

        if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) {
                if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0)
                        sample |= TS_RTO;
                if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0)
                        sample |= TS_DUPACKS;
                if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold)
                        sample |= TS_FASTREXMT;
        }

        if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) {
                uint32_t snd_wnd;

                sample |= TS_SND_BACKLOGGED;    /* for whatever reason. */

                snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
                if (tflags & V_TF_RECV_SCALE(1))
                        snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE);
                if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd)
                        sample |= TS_CWND_LIMITED;      /* maybe due to CWND */
        }

        if (tflags & V_TF_CCTRL_ECN(1)) {

                /*
                 * CE marker on incoming IP hdr, echoing ECE back in the TCP
                 * hdr.  Indicates congestion somewhere on the way from the peer
                 * to this node.
                 */
                if (tflags & V_TF_CCTRL_ECE(1))
                        sample |= TS_ECN_ECE;

                /*
                 * ECE seen and CWR sent (or about to be sent).  Might indicate
                 * congestion on the way to the peer.  This node is reducing its
                 * congestion window in response.
                 */
                if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1)))
                        sample |= TS_ECN_CWR;
        }

        te->te_sample[te->te_pidx] = sample;
        if (++te->te_pidx == nitems(te->te_sample))
                te->te_pidx = 0;
        memcpy(te->te_tcb, tcb, TCB_SIZE);
        te->te_flags |= TE_ACTIVE;
}

static int
do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *);
        const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1);
        struct tcb_histent *te;
        const u_int tid = GET_TID(cpl);
        bool remove;

        remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED;
        te = lookup_tcb_histent(sc, tid, remove);
        if (te == NULL) {
                /* Not in the history.  Who issued the GET_TCB for this? */
                device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, "
                    "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid,
                    (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE),
                    GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE),
                    GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie);
                goto done;
        }

        MPASS(te->te_flags & TE_RPL_PENDING);
        te->te_flags &= ~TE_RPL_PENDING;
        if (remove) {
                remove_tcb_histent(te);
        } else {
                update_tcb_histent(te, tcb);
                callout_reset(&te->te_callout, hz / 10, request_tcb, te);
                release_tcb_histent(te);
        }
done:
        m_freem(m);
        return (0);
}

static void
fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti)
{
        uint32_t v;

        ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE);

        v = GET_TCB_FIELD(tcb, T_SRTT);
        ti->tcpi_rtt = tcp_ticks_to_us(sc, v);

        v = GET_TCB_FIELD(tcb, T_RTTVAR);
        ti->tcpi_rttvar = tcp_ticks_to_us(sc, v);

        ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH);
        ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND);
        ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT);

        v = GET_TCB_FIELD(tcb, TX_MAX);
        ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW);

        /* Receive window being advertised by us. */
        ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE);    /* Yes, SND. */
        ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND);

        /* Send window */
        ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE);    /* Yes, RCV. */
        ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
        if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1))
                ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale;
        else
                ti->tcpi_snd_wscale = 0;
}

static void
fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te,
    struct tcp_info *ti)
{

        fill_tcp_info_from_tcb(sc, te->te_tcb, ti);
}

/*
 * Reads the TCB for the given tid using a memory window and copies it to 'buf'
 * in the same format as CPL_GET_TCB_RPL.
 */
static void
read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf)
{
        int i, j, k, rc;
        uint32_t addr;
        u_char *tcb, tmp;

        MPASS(tid >= sc->tids.tid_base);
        MPASS(tid - sc->tids.tid_base < sc->tids.ntids);

        addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE;
        rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE);
        if (rc != 0)
                return;

        tcb = (u_char *)buf;
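
        /*
         * The loop below reverses the image 16 bytes at a time so that it
         * matches the flit order of a CPL_GET_TCB_RPL payload.  For
         * illustration, if TCB_SIZE were 128 the eight 16-byte chunks would
         * be swapped pairwise: 0<->7, 1<->6, 2<->5, 3<->4.
         */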
        for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) {
                for (k = 0; k < 16; k++) {
                        tmp = tcb[i + k];
                        tcb[i + k] = tcb[j + k];
                        tcb[j + k] = tmp;
                }
        }
}

static void
fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti)
{
        uint64_t tcb[TCB_SIZE / sizeof(uint64_t)];
        struct tcb_histent *te;

        ti->tcpi_toe_tid = tid;
        te = lookup_tcb_histent(sc, tid, false);
        if (te != NULL) {
                fill_tcp_info_from_history(sc, te, ti);
                release_tcb_histent(te);
        } else {
                if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) {
                        /* XXX: tell firmware to flush TCB cache. */
                }
                read_tcb_using_memwin(sc, tid, tcb);
                fill_tcp_info_from_tcb(sc, tcb, ti);
        }
}

/*
 * Called by the kernel to allow the TOE driver to "refine" values filled up in
 * the tcp_info for an offloaded connection.
 */
static void
t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti)
{
        struct adapter *sc = tod->tod_softc;
        struct toepcb *toep = tp->t_toe;

        INP_WLOCK_ASSERT(tptoinpcb(tp));
        MPASS(ti != NULL);

        fill_tcp_info(sc, toep->tid, ti);
}

#ifdef KERN_TLS
static int
t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp,
    struct ktls_session *tls, int direction)
{
        struct toepcb *toep = tp->t_toe;

        INP_WLOCK_ASSERT(tptoinpcb(tp));
        MPASS(tls != NULL);

        return (tls_alloc_ktls(toep, tls, direction));
}
#endif

/* SET_TCB_FIELD sent as a ULP command looks like this */
#define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
    sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
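
/*
 * Size sketch for the macro above (the component sizes are assumptions for
 * illustration, not values taken from the headers): with an 8-byte ulp_txpkt,
 * an 8-byte ulptx_idata and a 24-byte cpl_set_tcb_field_core, the total would
 * be 40 bytes.  40 % 16 != 0, so mk_set_tcb_field_ulp() below appends an
 * 8-byte ULP_TX_SC_NOOP to pad the command to a 16-byte boundary, and callers
 * such as t4_pmtu_update() size their work requests with
 * roundup2(LEN__SET_TCB_FIELD_ULP, 16).
 */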

static void *
mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, uint64_t word, uint64_t mask,
                uint64_t val, uint32_t tid)
{
        struct ulptx_idata *ulpsc;
        struct cpl_set_tcb_field_core *req;

        ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
        ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));

        ulpsc = (struct ulptx_idata *)(ulpmc + 1);
        ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
        ulpsc->len = htobe32(sizeof(*req));

        req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
        req->reply_ctrl = htobe16(V_NO_REPLY(1));
        req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
        req->mask = htobe64(mask);
        req->val = htobe64(val);

        ulpsc = (struct ulptx_idata *)(req + 1);
        if (LEN__SET_TCB_FIELD_ULP % 16) {
                ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
                ulpsc->len = htobe32(0);
                return (ulpsc + 1);
        }
        return (ulpsc);
}

static void
send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep)
{
        struct wrq_cookie cookie;
        struct fw_flowc_wr *flowc;
        struct ofld_tx_sdesc *txsd;
        const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval);
        const int flowclen16 = howmany(flowclen, 16);

        if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) {
                CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__,
                    toep->tid, toep->tx_credits, toep->txsd_avail);
                return;
        }

        flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie);
        if (__predict_false(flowc == NULL)) {
                CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid);
                return;
        }
        flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
            V_FW_FLOWC_WR_NPARAMS(1));
        flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
            V_FW_WR_FLOWID(toep->tid));
        flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS;
        flowc->mnemval[0].val = htobe32(toep->params.emss);

        txsd = &toep->txsd[toep->txsd_pidx];
        txsd->tx_credits = flowclen16;
        txsd->plen = 0;
        toep->tx_credits -= txsd->tx_credits;
        if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
                toep->txsd_pidx = 0;
        toep->txsd_avail--;
        commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie);
}

static void
t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu)
{
        struct work_request_hdr *wrh;
        struct ulp_txpkt *ulpmc;
        int idx, len;
        struct wrq_cookie cookie;
        struct inpcb *inp = tptoinpcb(tp);
        struct toepcb *toep = tp->t_toe;
        struct adapter *sc = td_adapter(toep->td);
        unsigned short *mtus = &sc->params.mtus[0];

        INP_WLOCK_ASSERT(inp);
        MPASS(mtu > 0); /* kernel is supposed to provide something usable. */

        /* tp->snd_una and snd_max are in host byte order too. */
        seq = be32toh(seq);

        CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)",
            __func__, toep->tid, seq, mtu, toep->params.mtu_idx,
            mtus[toep->params.mtu_idx]);

        if (ulp_mode(toep) == ULP_MODE_NONE &&  /* XXX: Read TCB otherwise? */
            (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) {
                CTR5(KTR_CXGBE,
                    "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).",
                    __func__, toep->tid, seq, tp->snd_una, tp->snd_max);
                return;
        }

        /* Find the best mtu_idx for the suggested MTU. */
        for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++)
                continue;
        if (idx >= toep->params.mtu_idx)
                return; /* Never increase the PMTU (just like the kernel). */

        /*
         * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first
         * one updates the mtu_idx and the second one triggers a retransmit.
         */
        len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
        wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie);
        if (wrh == NULL) {
                CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n",
                    toep->tid, toep->params.mtu_idx, idx);
                return;
        }
        INIT_ULPTX_WRH(wrh, len, 1, 0); /* atomic */
        ulpmc = (struct ulp_txpkt *)(wrh + 1);
        ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_T_MAXSEG,
            V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx), toep->tid);
        ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_TIMESTAMP,
            V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0, toep->tid);
        commit_wrq_wr(toep->ctrlq, wrh, &cookie);

        /* Update the software toepcb and tcpcb. */
        toep->params.mtu_idx = idx;
        tp->t_maxseg = mtus[toep->params.mtu_idx];
        if (inp->inp_inc.inc_flags & INC_ISIPV6)
                tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
        else
                tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
        toep->params.emss = tp->t_maxseg;
        if (tp->t_flags & TF_RCVD_TSTMP)
                toep->params.emss -= TCPOLEN_TSTAMP_APPA;

        /* Update the firmware flowc. */
        send_mss_flowc_wr(sc, toep);

        /* Update the MTU in the kernel's hostcache. */
        if (sc->tt.update_hc_on_pmtu_change != 0) {
                struct in_conninfo inc = {0};

                inc.inc_fibnum = inp->inp_inc.inc_fibnum;
                if (inp->inp_inc.inc_flags & INC_ISIPV6) {
                        inc.inc_flags |= INC_ISIPV6;
                        inc.inc6_faddr = inp->inp_inc.inc6_faddr;
                } else {
                        inc.inc_faddr = inp->inp_inc.inc_faddr;
                }
                tcp_hc_updatemtu(&inc, mtu);
        }

        CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
            __func__, toep->tid, toep->params.mtu_idx,
            mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss);
}

/*
 * The TOE driver will not receive any more CPLs for the tid associated with the
 * toepcb; release the hold on the inpcb.
 */
void
final_cpl_received(struct toepcb *toep)
{
        struct inpcb *inp = toep->inp;
        bool need_wakeup;

        KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
        INP_WLOCK_ASSERT(inp);
        KASSERT(toep->flags & TPF_CPL_PENDING,
            ("%s: CPL not pending already?", __func__));

        CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
            __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);

        if (ulp_mode(toep) == ULP_MODE_TCPDDP)
                release_ddp_resources(toep);
        toep->inp = NULL;
        need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0;
        toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL);
        mbufq_drain(&toep->ulp_pduq);
        mbufq_drain(&toep->ulp_pdu_reclaimq);

        if (!(toep->flags & TPF_ATTACHED))
                release_offload_resources(toep);

        if (!in_pcbrele_wlocked(inp))
                INP_WUNLOCK(inp);

        if (need_wakeup) {
                struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);

                mtx_lock(lock);
                wakeup(toep);
                mtx_unlock(lock);
        }
}

void
insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
{
        struct tid_info *t = &sc->tids;

        MPASS(tid >= t->tid_base);
        MPASS(tid - t->tid_base < t->ntids);

        t->tid_tab[tid - t->tid_base] = ctx;
        atomic_add_int(&t->tids_in_use, ntids);
}

void *
lookup_tid(struct adapter *sc, int tid)
{
        struct tid_info *t = &sc->tids;

        return (t->tid_tab[tid - t->tid_base]);
}

void
update_tid(struct adapter *sc, int tid, void *ctx)
{
        struct tid_info *t = &sc->tids;

        t->tid_tab[tid - t->tid_base] = ctx;
}

void
remove_tid(struct adapter *sc, int tid, int ntids)
{
        struct tid_info *t = &sc->tids;

        t->tid_tab[tid - t->tid_base] = NULL;
        atomic_subtract_int(&t->tids_in_use, ntids);
}

/*
 * What mtu_idx to use, given a 4-tuple.  Note that both s->mss and tcp_mssopt
 * have the MSS that we should advertise in our SYN.  Advertised MSS doesn't
 * account for any TCP options so the effective MSS (only payload, no headers or
 * options) could be different.
 */
static int
find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc,
    struct offload_settings *s)
{
        unsigned short *mtus = &sc->params.mtus[0];
        int i, mss, mtu;

        MPASS(inc != NULL);

        mss = s->mss > 0 ? s->mss : tcp_mssopt(inc);
        if (inc->inc_flags & INC_ISIPV6)
                mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
        else
                mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr);

        for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++)
                continue;

        return (i);
}
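
/*
 * Worked example (the MTU table contents are illustrative): an advertised
 * IPv4 MSS of 1460 gives mtu = 1460 + 20 + 20 = 1500.  If the table held
 * { ..., 1280, 1488, 1500, 2002, ... }, the loop above advances while the
 * next entry is <= 1500 and returns the index of the 1500 entry, i.e. the
 * largest MTU that does not exceed the target.
 */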

/*
 * Determine the receive window size for a socket.
 */
u_long
select_rcv_wnd(struct socket *so)
{
        unsigned long wnd;

        SOCKBUF_LOCK_ASSERT(&so->so_rcv);

        wnd = sbspace(&so->so_rcv);
        if (wnd < MIN_RCV_WND)
                wnd = MIN_RCV_WND;

        return min(wnd, MAX_RCV_WND);
}

int
select_rcv_wscale(void)
{
        int wscale = 0;
        unsigned long space = sb_max;

        if (space > MAX_RCV_WND)
                space = MAX_RCV_WND;

        while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
                wscale++;

        return (wscale);
}
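
/*
 * Worked example: if the clamped space were 256KB (262144 bytes), the loop
 * above iterates while TCP_MAXWIN << wscale = 65535 << wscale stays below it.
 * 65535, 131070, and 262140 are all < 262144, so the function returns
 * wscale = 3, where 65535 << 3 = 524280 finally covers the space.
 */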

__be64
calc_options0(struct vi_info *vi, struct conn_params *cp)
{
        uint64_t opt0 = 0;

        opt0 |= F_TCAM_BYPASS;

        MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE);
        opt0 |= V_WND_SCALE(cp->wscale);

        MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS);
        opt0 |= V_MSS_IDX(cp->mtu_idx);

        MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE);
        opt0 |= V_ULP_MODE(cp->ulp_mode);

        MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ);
        opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize);

        MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size);
        opt0 |= V_L2T_IDX(cp->l2t_idx);

        opt0 |= V_SMAC_SEL(vi->smt_idx);
        opt0 |= V_TX_CHAN(vi->pi->tx_chan);

        MPASS(cp->keepalive == 0 || cp->keepalive == 1);
        opt0 |= V_KEEP_ALIVE(cp->keepalive);

        MPASS(cp->nagle == 0 || cp->nagle == 1);
        opt0 |= V_NAGLE(cp->nagle);

        return (htobe64(opt0));
}

__be32
calc_options2(struct vi_info *vi, struct conn_params *cp)
{
        uint32_t opt2 = 0;
        struct port_info *pi = vi->pi;
        struct adapter *sc = pi->adapter;

        /*
         * rx flow control, rx coalesce, congestion control, and tx pace are all
         * explicitly set by the driver.  On T5+ the ISS is also set by the
         * driver to the value picked by the kernel.
         */
        if (is_t4(sc)) {
                opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
                opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
        } else {
                opt2 |= F_T5_OPT_2_VALID;       /* all 4 valid */
                opt2 |= F_T5_ISS;               /* ISS provided in CPL */
        }

        MPASS(cp->sack == 0 || cp->sack == 1);
        opt2 |= V_SACK_EN(cp->sack);

        MPASS(cp->tstamp == 0 || cp->tstamp == 1);
        opt2 |= V_TSTAMPS_EN(cp->tstamp);

        if (cp->wscale > 0)
                opt2 |= F_WND_SCALE_EN;

        MPASS(cp->ecn == 0 || cp->ecn == 1);
        opt2 |= V_CCTRL_ECN(cp->ecn);

        /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */

        opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
        opt2 |= V_PACE(0);
        opt2 |= F_RSS_QUEUE_VALID;
        opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id);

        MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL);
        opt2 |= V_CONG_CNTRL(cp->cong_algo);

        MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1);
        if (cp->rx_coalesce == 1)
                opt2 |= V_RX_COALESCE(M_RX_COALESCE);

        opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
#ifdef USE_DDP_RX_FLOW_CONTROL
        if (cp->ulp_mode == ULP_MODE_TCPDDP)
                opt2 |= F_RX_FC_DDP;
#endif

        return (htobe32(opt2));
}

uint64_t
select_ntuple(struct vi_info *vi, struct l2t_entry *e)
{
        struct adapter *sc = vi->adapter;
        struct tp_params *tp = &sc->params.tp;
        uint64_t ntuple = 0;

        /*
         * Initialize each of the fields which we care about which are present
         * in the Compressed Filter Tuple.
         */
        if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE)
                ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;

        if (tp->port_shift >= 0)
                ntuple |= (uint64_t)e->lport << tp->port_shift;

        if (tp->protocol_shift >= 0)
                ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;

        if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) {
                ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) |
                    V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) <<
                    tp->vnic_shift;
        }

        if (is_t4(sc))
                return (htobe32((uint32_t)ntuple));
        else
                return (htobe64(V_FILTER_TUPLE(ntuple)));
}

/*
 * Initialize various connection parameters.
 */
void
init_conn_params(struct vi_info *vi, struct offload_settings *s,
    struct in_conninfo *inc, struct socket *so,
    const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp)
{
        struct port_info *pi = vi->pi;
        struct adapter *sc = pi->adapter;
        struct tom_tunables *tt = &sc->tt;
        struct inpcb *inp = sotoinpcb(so);
        struct tcpcb *tp = intotcpcb(inp);
        u_long wnd;
        u_int q_idx;

        MPASS(s->offload != 0);

        /* Congestion control algorithm */
        if (s->cong_algo >= 0)
                cp->cong_algo = s->cong_algo & M_CONG_CNTRL;
        else if (sc->tt.cong_algorithm >= 0)
                cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL;
        else {
                struct cc_algo *cc = CC_ALGO(tp);

                if (strcasecmp(cc->name, "reno") == 0)
                        cp->cong_algo = CONG_ALG_RENO;
                else if (strcasecmp(cc->name, "tahoe") == 0)
                        cp->cong_algo = CONG_ALG_TAHOE;
                else if (strcasecmp(cc->name, "newreno") == 0)
                        cp->cong_algo = CONG_ALG_NEWRENO;
                else if (strcasecmp(cc->name, "highspeed") == 0)
                        cp->cong_algo = CONG_ALG_HIGHSPEED;
                else {
                        /*
                         * Use newreno in case the algorithm selected by the
                         * host stack is not supported by the hardware.
                         */
                        cp->cong_algo = CONG_ALG_NEWRENO;
                }
 1298         }
 1299 
 1300         /* Tx traffic scheduling class. */
 1301         if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls)
 1302                 cp->tc_idx = s->sched_class;
 1303         else
 1304                 cp->tc_idx = -1;
 1305 
 1306         /* Nagle's algorithm. */
 1307         if (s->nagle >= 0)
 1308                 cp->nagle = s->nagle > 0 ? 1 : 0;
 1309         else
 1310                 cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
 1311 
 1312         /* TCP Keepalive. */
 1313         if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE)
 1314                 cp->keepalive = 1;
 1315         else
 1316                 cp->keepalive = 0;
 1317 
 1318         /* Optimization that's specific to T5 @ 40G. */
 1319         if (tt->tx_align >= 0)
 1320                 cp->tx_align =  tt->tx_align > 0 ? 1 : 0;
 1321         else if (chip_id(sc) == CHELSIO_T5 &&
 1322             (port_top_speed(pi) > 10 || sc->params.nports > 2))
 1323                 cp->tx_align = 1;
 1324         else
 1325                 cp->tx_align = 0;
 1326 
 1327         /* ULP mode. */
 1328         if (s->ddp > 0 ||
 1329             (s->ddp < 0 && sc->tt.ddp && (so_options_get(so) & SO_NO_DDP) == 0))
 1330                 cp->ulp_mode = ULP_MODE_TCPDDP;
 1331         else
 1332                 cp->ulp_mode = ULP_MODE_NONE;
 1333 
 1334         /* Rx coalescing. */
 1335         if (s->rx_coalesce >= 0)
 1336                 cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0;
 1337         else if (tt->rx_coalesce >= 0)
 1338                 cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0;
 1339         else
 1340                 cp->rx_coalesce = 1;    /* default */
 1341 
 1342         /*
 1343          * Index in the PMTU table.  This controls the MSS that we announce in
 1344          * our SYN initially, but after ESTABLISHED it controls the MSS that we
 1345          * use to send data.
 1346          */
 1347         cp->mtu_idx = find_best_mtu_idx(sc, inc, s);
 1348 
 1349         /* Tx queue for this connection. */
 1350         if (s->txq == QUEUE_RANDOM)
 1351                 q_idx = arc4random();
 1352         else if (s->txq == QUEUE_ROUNDROBIN)
 1353                 q_idx = atomic_fetchadd_int(&vi->txq_rr, 1);
 1354         else
 1355                 q_idx = s->txq;
 1356         cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq;
 1357 
 1358         /* Rx queue for this connection. */
 1359         if (s->rxq == QUEUE_RANDOM)
 1360                 q_idx = arc4random();
 1361         else if (s->rxq == QUEUE_ROUNDROBIN)
 1362                 q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1);
 1363         else
 1364                 q_idx = s->rxq;
 1365         cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq;
 1366 
 1367         if (SOLISTENING(so)) {
 1368                 /* Passive open */
 1369                 MPASS(tcpopt != NULL);
 1370 
 1371                 /* TCP timestamp option */
 1372                 if (tcpopt->tstamp &&
 1373                     (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
 1374                         cp->tstamp = 1;
 1375                 else
 1376                         cp->tstamp = 0;
 1377 
 1378                 /* SACK */
 1379                 if (tcpopt->sack &&
 1380                     (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack)))
 1381                         cp->sack = 1;
 1382                 else
 1383                         cp->sack = 0;
 1384 
 1385                 /* Receive window scaling. */
 1386                 if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323)
 1387                         cp->wscale = select_rcv_wscale();
 1388                 else
 1389                         cp->wscale = 0;
 1390 
 1391                 /* ECN */
 1392                 if (tcpopt->ecn &&      /* XXX: review. */
 1393                     (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
 1394                         cp->ecn = 1;
 1395                 else
 1396                         cp->ecn = 0;
 1397 
 1398                 wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
 1399                 cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
 1400 
 1401                 if (tt->sndbuf > 0)
 1402                         cp->sndbuf = tt->sndbuf;
 1403                 else if (so->sol_sbsnd_flags & SB_AUTOSIZE &&
 1404                     V_tcp_do_autosndbuf)
 1405                         cp->sndbuf = 256 * 1024;
 1406                 else
 1407                         cp->sndbuf = so->sol_sbsnd_hiwat;
 1408         } else {
 1409                 /* Active open */
 1410 
 1411                 /* TCP timestamp option */
 1412                 if (s->tstamp > 0 ||
 1413                     (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
 1414                         cp->tstamp = 1;
 1415                 else
 1416                         cp->tstamp = 0;
 1417 
 1418                 /* SACK */
 1419                 if (s->sack > 0 ||
 1420                     (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
 1421                         cp->sack = 1;
 1422                 else
 1423                         cp->sack = 0;
 1424 
 1425                 /* Receive window scaling */
 1426                 if (tp->t_flags & TF_REQ_SCALE)
 1427                         cp->wscale = select_rcv_wscale();
 1428                 else
 1429                         cp->wscale = 0;
 1430 
 1431                 /* ECN */
 1432                 if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
 1433                         cp->ecn = 1;
 1434                 else
 1435                         cp->ecn = 0;
 1436 
 1437                 SOCKBUF_LOCK(&so->so_rcv);
 1438                 wnd = max(select_rcv_wnd(so), MIN_RCV_WND);
 1439                 SOCKBUF_UNLOCK(&so->so_rcv);
 1440                 cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
 1441 
 1442                 if (tt->sndbuf > 0)
 1443                         cp->sndbuf = tt->sndbuf;
 1444                 else {
 1445                         SOCKBUF_LOCK(&so->so_snd);
 1446                         if (so->so_snd.sb_flags & SB_AUTOSIZE &&
 1447                             V_tcp_do_autosndbuf)
 1448                                 cp->sndbuf = 256 * 1024;
 1449                         else
 1450                                 cp->sndbuf = so->so_snd.sb_hiwat;
 1451                         SOCKBUF_UNLOCK(&so->so_snd);
 1452                 }
 1453         }
 1454 
 1455         cp->l2t_idx = l2t_idx;
 1456 
 1457         /* This will be initialized on ESTABLISHED. */
 1458         cp->emss = 0;
 1459 }
 1460 
 1461 int
 1462 negative_advice(int status)
 1463 {
 1464 
 1465         return (status == CPL_ERR_RTX_NEG_ADVICE ||
 1466             status == CPL_ERR_PERSIST_NEG_ADVICE ||
 1467             status == CPL_ERR_KEEPALV_NEG_ADVICE);
 1468 }
 1469 
 1470 static int
 1471 alloc_tid_tab(struct tid_info *t, int flags)
 1472 {
 1473 
 1474         MPASS(t->ntids > 0);
 1475         MPASS(t->tid_tab == NULL);
 1476 
 1477         t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE,
 1478             M_ZERO | flags);
 1479         if (t->tid_tab == NULL)
 1480                 return (ENOMEM);
 1481         atomic_store_rel_int(&t->tids_in_use, 0);
 1482 
 1483         return (0);
 1484 }
 1485 
 1486 static void
 1487 free_tid_tab(struct tid_info *t)
 1488 {
 1489 
 1490         KASSERT(t->tids_in_use == 0,
 1491             ("%s: %d tids still in use.", __func__, t->tids_in_use));
 1492 
 1493         free(t->tid_tab, M_CXGBE);
 1494         t->tid_tab = NULL;
 1495 }
 1496 
 1497 static int
 1498 alloc_stid_tab(struct tid_info *t, int flags)
 1499 {
 1500 
 1501         MPASS(t->nstids > 0);
 1502         MPASS(t->stid_tab == NULL);
 1503 
 1504         t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
 1505             M_ZERO | flags);
 1506         if (t->stid_tab == NULL)
 1507                 return (ENOMEM);
 1508         mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
 1509         t->stids_in_use = 0;
 1510         TAILQ_INIT(&t->stids);
 1511         t->nstids_free_head = t->nstids;
 1512 
 1513         return (0);
 1514 }
 1515 
 1516 static void
 1517 free_stid_tab(struct tid_info *t)
 1518 {
 1519 
 1520         KASSERT(t->stids_in_use == 0,
 1521             ("%s: %d tids still in use.", __func__, t->stids_in_use));
 1522 
 1523         if (mtx_initialized(&t->stid_lock))
 1524                 mtx_destroy(&t->stid_lock);
 1525         free(t->stid_tab, M_CXGBE);
 1526         t->stid_tab = NULL;
 1527 }
 1528 
 1529 static void
 1530 free_tid_tabs(struct tid_info *t)
 1531 {
 1532 
 1533         free_tid_tab(t);
 1534         free_stid_tab(t);
 1535 }
 1536 
 1537 static int
 1538 alloc_tid_tabs(struct tid_info *t)
 1539 {
 1540         int rc;
 1541 
 1542         rc = alloc_tid_tab(t, M_NOWAIT);
 1543         if (rc != 0)
 1544                 goto failed;
 1545 
 1546         rc = alloc_stid_tab(t, M_NOWAIT);
 1547         if (rc != 0)
 1548                 goto failed;
 1549 
 1550         return (0);
 1551 failed:
 1552         free_tid_tabs(t);
 1553         return (rc);
 1554 }
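
The shared "failed:" label works because each free routine is safe to call on state that was never set up: free(9) accepts NULL, and the stid mutex is destroyed only when mtx_initialized() says it exists. A sketch of that shape, with hypothetical names, mirroring free_stid_tab() above:

    /* Hypothetical example: a free routine safe to run after a partial alloc. */
    static void
    free_foo_tab(struct foo_info *f)
    {
            if (mtx_initialized(&f->foo_lock))      /* alloc may not have got here */
                    mtx_destroy(&f->foo_lock);
            free(f->foo_tab, M_TEMP);               /* free(9) tolerates NULL */
            f->foo_tab = NULL;                      /* allow a later re-alloc */
    }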
 1555 
 1556 static inline void
 1557 alloc_tcb_history(struct adapter *sc, struct tom_data *td)
 1558 {
 1559 
 1560         if (sc->tids.ntids == 0 || sc->tids.ntids > 1024)
 1561                 return;
 1562         rw_init(&td->tcb_history_lock, "TCB history");
 1563         td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history),
 1564             M_CXGBE, M_ZERO | M_NOWAIT);
 1565         td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0));
 1566 }
 1567 
 1568 static inline void
 1569 free_tcb_history(struct adapter *sc, struct tom_data *td)
 1570 {
 1571 #ifdef INVARIANTS
 1572         int i;
 1573 
 1574         if (td->tcb_history != NULL) {
 1575                 for (i = 0; i < sc->tids.ntids; i++) {
 1576                         MPASS(td->tcb_history[i] == NULL);
 1577                 }
 1578         }
 1579 #endif
 1580         free(td->tcb_history, M_CXGBE);
 1581         if (rw_initialized(&td->tcb_history_lock))
 1582                 rw_destroy(&td->tcb_history_lock);
 1583 }
 1584 
 1585 static void
 1586 free_tom_data(struct adapter *sc, struct tom_data *td)
 1587 {
 1588 
 1589         ASSERT_SYNCHRONIZED_OP(sc);
 1590 
 1591         KASSERT(TAILQ_EMPTY(&td->toep_list),
 1592             ("%s: TOE PCB list is not empty.", __func__));
 1593         KASSERT(td->lctx_count == 0,
 1594             ("%s: lctx hash table is not empty.", __func__));
 1595 
 1596         t4_free_ppod_region(&td->pr);
 1597 
 1598         if (td->listen_mask != 0)
 1599                 hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
 1600 
 1601         if (mtx_initialized(&td->unsent_wr_lock))
 1602                 mtx_destroy(&td->unsent_wr_lock);
 1603         if (mtx_initialized(&td->lctx_hash_lock))
 1604                 mtx_destroy(&td->lctx_hash_lock);
 1605         if (mtx_initialized(&td->toep_list_lock))
 1606                 mtx_destroy(&td->toep_list_lock);
 1607 
 1608         free_tcb_history(sc, td);
 1609         free_tid_tabs(&sc->tids);
 1610         free(td, M_CXGBE);
 1611 }
 1612 
 1613 static char *
 1614 prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen,
 1615     int *buflen)
 1616 {
 1617         char *pkt;
 1618         struct tcphdr *th;
 1619         int ipv6, len;
 1620         const int maxlen =
 1621             max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) +
 1622             max(sizeof(struct ip), sizeof(struct ip6_hdr)) +
 1623             sizeof(struct tcphdr);
 1624 
 1625         MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN);
 1626 
 1627         pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT);
 1628         if (pkt == NULL)
 1629                 return (NULL);
 1630 
 1631         ipv6 = inp->inp_vflag & INP_IPV6;
 1632         len = 0;
 1633 
 1634         if (EVL_VLANOFTAG(vtag) == 0xfff) {
 1635                 struct ether_header *eh = (void *)pkt;
 1636 
 1637                 if (ipv6)
 1638                         eh->ether_type = htons(ETHERTYPE_IPV6);
 1639                 else
 1640                         eh->ether_type = htons(ETHERTYPE_IP);
 1641 
 1642                 len += sizeof(*eh);
 1643         } else {
 1644                 struct ether_vlan_header *evh = (void *)pkt;
 1645 
 1646                 evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
 1647                 evh->evl_tag = htons(vtag);
 1648                 if (ipv6)
 1649                         evh->evl_proto = htons(ETHERTYPE_IPV6);
 1650                 else
 1651                         evh->evl_proto = htons(ETHERTYPE_IP);
 1652 
 1653                 len += sizeof(*evh);
 1654         }
 1655 
 1656         if (ipv6) {
 1657                 struct ip6_hdr *ip6 = (void *)&pkt[len];
 1658 
 1659                 ip6->ip6_vfc = IPV6_VERSION;
 1660                 ip6->ip6_plen = htons(sizeof(struct tcphdr));
 1661                 ip6->ip6_nxt = IPPROTO_TCP;
 1662                 if (open_type == OPEN_TYPE_ACTIVE) {
 1663                         ip6->ip6_src = inp->in6p_laddr;
 1664                         ip6->ip6_dst = inp->in6p_faddr;
 1665                 } else if (open_type == OPEN_TYPE_LISTEN) {
 1666                         ip6->ip6_src = inp->in6p_laddr;
 1667                         ip6->ip6_dst = ip6->ip6_src;
 1668                 }
 1669 
 1670                 len += sizeof(*ip6);
 1671         } else {
 1672                 struct ip *ip = (void *)&pkt[len];
 1673 
 1674                 ip->ip_v = IPVERSION;
 1675                 ip->ip_hl = sizeof(*ip) >> 2;
 1676                 ip->ip_tos = inp->inp_ip_tos;
 1677                 ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
 1678                 ip->ip_ttl = inp->inp_ip_ttl;
 1679                 ip->ip_p = IPPROTO_TCP;
 1680                 if (open_type == OPEN_TYPE_ACTIVE) {
 1681                         ip->ip_src = inp->inp_laddr;
 1682                         ip->ip_dst = inp->inp_faddr;
 1683                 } else if (open_type == OPEN_TYPE_LISTEN) {
 1684                         ip->ip_src = inp->inp_laddr;
 1685                         ip->ip_dst = ip->ip_src;
 1686                 }
 1687 
 1688                 len += sizeof(*ip);
 1689         }
 1690 
 1691         th = (void *)&pkt[len];
 1692         if (open_type == OPEN_TYPE_ACTIVE) {
 1693                 th->th_sport = inp->inp_lport;  /* network byte order already */
 1694                 th->th_dport = inp->inp_fport;  /* ditto */
 1695         } else if (open_type == OPEN_TYPE_LISTEN) {
 1696                 th->th_sport = inp->inp_lport;  /* network byte order already */
 1697                 th->th_dport = th->th_sport;
 1698         }
 1699         len += sizeof(*th);
 1700 
 1701         *pktlen = *buflen = len;
 1702         return (pkt);
 1703 }
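
prepare_pkt() synthesizes a header-only frame so the BPF programs in the offload policy have something to match against for active and listen opens, which have no real packet on hand; a listening socket has no peer yet, so the foreign address and port are simply mirrored from the local ones. For reference, the layout for an untagged IPv4 open (a VLAN id of 0xfff in vtag means "no 802.1Q header"):

    /*
     * Illustrative layout only:
     *   [ ether_header 14 ][ ip 20 ][ tcphdr 20 ]   *pktlen == *buflen == 54
     * Only fields a rule can usefully match are filled in: ethertype,
     * addresses, ports, TOS, TTL, and the TCP protocol number.
     */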
 1704 
 1705 const struct offload_settings *
 1706 lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m,
 1707     uint16_t vtag, struct inpcb *inp)
 1708 {
 1709         const struct t4_offload_policy *op;
 1710         char *pkt;
 1711         struct offload_rule *r;
 1712         int i, matched, pktlen, buflen;
 1713         static const struct offload_settings allow_offloading_settings = {
 1714                 .offload = 1,
 1715                 .rx_coalesce = -1,
 1716                 .cong_algo = -1,
 1717                 .sched_class = -1,
 1718                 .tstamp = -1,
 1719                 .sack = -1,
 1720                 .nagle = -1,
 1721                 .ecn = -1,
 1722                 .ddp = -1,
 1723                 .tls = -1,
 1724                 .txq = QUEUE_RANDOM,
 1725                 .rxq = QUEUE_RANDOM,
 1726                 .mss = -1,
 1727         };
 1728         static const struct offload_settings disallow_offloading_settings = {
 1729                 .offload = 0,
 1730                 /* rest is irrelevant when offload is off. */
 1731         };
 1732 
 1733         rw_assert(&sc->policy_lock, RA_LOCKED);
 1734 
 1735         /*
 1736          * If there's no Connection Offloading Policy attached to the device
 1737          * then we need to return a default static policy.  If
 1738          * "cop_managed_offloading" is true, then we need to disallow
 1739          * offloading until a COP is attached to the device.  Otherwise we
 1740          * allow offloading ...
 1741          */
 1742         op = sc->policy;
 1743         if (op == NULL) {
 1744                 if (sc->tt.cop_managed_offloading)
 1745                         return (&disallow_offloading_settings);
 1746                 else
 1747                         return (&allow_offloading_settings);
 1748         }
 1749 
 1750         switch (open_type) {
 1751         case OPEN_TYPE_ACTIVE:
 1752         case OPEN_TYPE_LISTEN:
 1753                 pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen);
 1754                 break;
 1755         case OPEN_TYPE_PASSIVE:
 1756                 MPASS(m != NULL);
 1757                 pkt = mtod(m, char *);
 1758                 MPASS(*pkt == CPL_PASS_ACCEPT_REQ);
 1759                 pkt += sizeof(struct cpl_pass_accept_req);
 1760                 pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req);
 1761                 buflen = m->m_len - sizeof(struct cpl_pass_accept_req);
 1762                 break;
 1763         default:
 1764                 MPASS(0);
 1765                 return (&disallow_offloading_settings);
 1766         }
 1767 
 1768         if (pkt == NULL || pktlen == 0 || buflen == 0)
 1769                 return (&disallow_offloading_settings);
 1770 
 1771         matched = 0;
 1772         r = &op->rule[0];
 1773         for (i = 0; i < op->nrules; i++, r++) {
 1774                 if (r->open_type != open_type &&
 1775                     r->open_type != OPEN_TYPE_DONTCARE) {
 1776                         continue;
 1777                 }
 1778                 matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen);
 1779                 if (matched)
 1780                         break;
 1781         }
 1782 
 1783         if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN)
 1784                 free(pkt, M_CXGBE);
 1785 
 1786         return (matched ? &r->settings : &disallow_offloading_settings);
 1787 }
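
Rule evaluation above is first-match: a rule is considered only if its open_type agrees (or is OPEN_TYPE_DONTCARE), and it matches when its classic-BPF program returns nonzero against the synthesized or received packet. A sketch of the simplest program a rule could carry, shown userland-style with the standard macros from net/bpf.h (hypothetical, not driver code):

    #include <net/bpf.h>

    /*
     * One instruction: return a nonzero length, i.e. accept every packet.
     * A rule carrying this in bpf_prog.bf_insns would match unconditionally.
     */
    static const struct bpf_insn accept_all[] = {
            BPF_STMT(BPF_RET | BPF_K, (u_int)-1),
    };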
 1788 
 1789 static void
 1790 reclaim_wr_resources(void *arg, int count)
 1791 {
 1792         struct tom_data *td = arg;
 1793         STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
 1794         struct cpl_act_open_req *cpl;
 1795         u_int opcode, atid, tid;
 1796         struct wrqe *wr;
 1797         struct adapter *sc = td_adapter(td);
 1798 
 1799         mtx_lock(&td->unsent_wr_lock);
 1800         STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
 1801         mtx_unlock(&td->unsent_wr_lock);
 1802 
 1803         while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
 1804                 STAILQ_REMOVE_HEAD(&twr_list, link);
 1805 
 1806                 cpl = wrtod(wr);
 1807                 opcode = GET_OPCODE(cpl);
 1808 
 1809                 switch (opcode) {
 1810                 case CPL_ACT_OPEN_REQ:
 1811                 case CPL_ACT_OPEN_REQ6:
 1812                         atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
 1813                         CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
 1814                         act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
 1815                         free(wr, M_CXGBE);
 1816                         break;
 1817                 case CPL_PASS_ACCEPT_RPL:
 1818                         tid = GET_TID(cpl);
 1819                         CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid);
 1820                         synack_failure_cleanup(sc, tid);
 1821                         free(wr, M_CXGBE);
 1822                         break;
 1823                 default:
 1824                         log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
 1825                             "opcode %x\n", __func__, wr, wr->wr_len, opcode);
 1826                         /* WR not freed here; go look at it with a debugger.  */
 1827                 }
 1828         }
 1829 }
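
reclaim_wr_resources() keeps the mutex hold time constant by swapping the shared list into a local head and then walking the snapshot unlocked. The idiom in isolation, with hypothetical names (struct widget is assumed to have an STAILQ_ENTRY(widget) link member):

    #include <sys/queue.h>

    STAILQ_HEAD(, widget) local = STAILQ_HEAD_INITIALIZER(local);
    struct widget *w;

    mtx_lock(&shared_lock);
    STAILQ_SWAP(&shared_list, &local, widget);      /* O(1) pointer swap */
    mtx_unlock(&shared_lock);

    while ((w = STAILQ_FIRST(&local)) != NULL) {
            STAILQ_REMOVE_HEAD(&local, link);
            process(w);                             /* lock not held here */
    }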
 1830 
 1831 /*
 1832  * Ground control to Major TOM
 1833  * Commencing countdown, engines on
 1834  */
 1835 static int
 1836 t4_tom_activate(struct adapter *sc)
 1837 {
 1838         struct tom_data *td;
 1839         struct toedev *tod;
 1840         struct vi_info *vi;
 1841         int i, rc, v;
 1842 
 1843         ASSERT_SYNCHRONIZED_OP(sc);
 1844 
 1845         /* per-adapter softc for TOM */
 1846         td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
 1847         if (td == NULL)
 1848                 return (ENOMEM);
 1849 
 1850         /* List of TOE PCBs and associated lock */
 1851         mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
 1852         TAILQ_INIT(&td->toep_list);
 1853 
 1854         /* Listen context */
 1855         mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
 1856         td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
 1857             &td->listen_mask, HASH_NOWAIT);
 1858 
 1859         /* List of WRs for which L2 resolution failed */
 1860         mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
 1861         STAILQ_INIT(&td->unsent_wr_list);
 1862         TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
 1863 
 1864         /* TID tables */
 1865         rc = alloc_tid_tabs(&sc->tids);
 1866         if (rc != 0)
 1867                 goto done;
 1868 
 1869         rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
 1870             t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
 1871         if (rc != 0)
 1872                 goto done;
 1873         t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
 1874             V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
 1875 
 1876         alloc_tcb_history(sc, td);
 1877 
 1878         /* toedev ops */
 1879         tod = &td->tod;
 1880         init_toedev(tod);
 1881         tod->tod_softc = sc;
 1882         tod->tod_connect = t4_connect;
 1883         tod->tod_listen_start = t4_listen_start;
 1884         tod->tod_listen_stop = t4_listen_stop;
 1885         tod->tod_rcvd = t4_rcvd;
 1886         tod->tod_output = t4_tod_output;
 1887         tod->tod_send_rst = t4_send_rst;
 1888         tod->tod_send_fin = t4_send_fin;
 1889         tod->tod_pcb_detach = t4_pcb_detach;
 1890         tod->tod_l2_update = t4_l2_update;
 1891         tod->tod_syncache_added = t4_syncache_added;
 1892         tod->tod_syncache_removed = t4_syncache_removed;
 1893         tod->tod_syncache_respond = t4_syncache_respond;
 1894         tod->tod_offload_socket = t4_offload_socket;
 1895         tod->tod_ctloutput = t4_ctloutput;
 1896         tod->tod_tcp_info = t4_tcp_info;
 1897 #ifdef KERN_TLS
 1898         tod->tod_alloc_tls_session = t4_alloc_tls_session;
 1899 #endif
 1900         tod->tod_pmtu_update = t4_pmtu_update;
 1901 
 1902         for_each_port(sc, i) {
 1903                 for_each_vi(sc->port[i], v, vi) {
 1904                         TOEDEV(vi->ifp) = &td->tod;
 1905                 }
 1906         }
 1907 
 1908         sc->tom_softc = td;
 1909         register_toedev(sc->tom_softc);
 1910 
 1911 done:
 1912         if (rc != 0)
 1913                 free_tom_data(sc, td);
 1914         return (rc);
 1915 }
 1916 
 1917 static int
 1918 t4_tom_deactivate(struct adapter *sc)
 1919 {
 1920         int rc = 0;
 1921         struct tom_data *td = sc->tom_softc;
 1922 
 1923         ASSERT_SYNCHRONIZED_OP(sc);
 1924 
 1925         if (td == NULL)
 1926                 return (0);     /* XXX. KASSERT? */
 1927 
 1928         if (sc->offload_map != 0)
 1929                 return (EBUSY); /* at least one port has IFCAP_TOE enabled */
 1930 
 1931         if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
 1932                 return (EBUSY); /* both iWARP and iSCSI rely on the TOE. */
 1933 
 1934         mtx_lock(&td->toep_list_lock);
 1935         if (!TAILQ_EMPTY(&td->toep_list))
 1936                 rc = EBUSY;
 1937         mtx_unlock(&td->toep_list_lock);
 1938 
 1939         mtx_lock(&td->lctx_hash_lock);
 1940         if (td->lctx_count > 0)
 1941                 rc = EBUSY;
 1942         mtx_unlock(&td->lctx_hash_lock);
 1943 
 1944         taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
 1945         mtx_lock(&td->unsent_wr_lock);
 1946         if (!STAILQ_EMPTY(&td->unsent_wr_list))
 1947                 rc = EBUSY;
 1948         mtx_unlock(&td->unsent_wr_lock);
 1949 
 1950         if (rc == 0) {
 1951                 unregister_toedev(sc->tom_softc);
 1952                 free_tom_data(sc, td);
 1953                 sc->tom_softc = NULL;
 1954         }
 1955 
 1956         return (rc);
 1957 }
 1958 
 1959 static int
 1960 t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
 1961 {
 1962         struct tcpcb *tp = sototcpcb(so);
 1963         struct toepcb *toep = tp->t_toe;
 1964         int error;
 1965 
 1966         /*
 1967          * No lock is needed as TOE sockets never change between
 1968          * active and passive.
 1969          */
 1970         if (SOLISTENING(so))
 1971                 return (EINVAL);
 1972 
 1973         if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 1974                 error = t4_aio_queue_ddp(so, job);
 1975                 if (error != EOPNOTSUPP)
 1976                         return (error);
 1977         }
 1978 
 1979         return (t4_aio_queue_aiotx(so, job));
 1980 }
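
The dispatch convention here is that a specialized handler declines a job by returning EOPNOTSUPP, and only then does the generic path run; any other return value, including 0, is final. The shape in isolation (hypothetical names):

    error = specialized_queue(so, job);     /* e.g. the DDP path above */
    if (error != EOPNOTSUPP)
            return (error);                 /* handled, or a real error */
    return (generic_queue(so, job));        /* fallback */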
 1981 
 1982 static int
 1983 t4_tom_mod_load(void)
 1984 {
 1985         /* CPL handlers */
 1986         t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
 1987         t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2,
 1988             CPL_COOKIE_TOM);
 1989         t4_init_connect_cpl_handlers();
 1990         t4_init_listen_cpl_handlers();
 1991         t4_init_cpl_io_handlers();
 1992 
 1993         t4_ddp_mod_load();
 1994         t4_tls_mod_load();
 1995 
 1996         bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw));
 1997         toe_protosw.pr_aio_queue = t4_aio_queue_tom;
 1998 
 1999         bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
 2000         toe6_protosw.pr_aio_queue = t4_aio_queue_tom;
 2001 
 2002         return (t4_register_uld(&tom_uld_info));
 2003 }
 2004 
 2005 static void
 2006 tom_uninit(struct adapter *sc, void *arg __unused)
 2007 {
 2008         if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun"))
 2009                 return;
 2010 
 2011         /* Try to free resources (works only if no port has IFCAP_TOE) */
 2012         if (uld_active(sc, ULD_TOM))
 2013                 t4_deactivate_uld(sc, ULD_TOM);
 2014 
 2015         end_synchronized_op(sc, 0);
 2016 }
 2017 
 2018 static int
 2019 t4_tom_mod_unload(void)
 2020 {
 2021         t4_iterate(tom_uninit, NULL);
 2022 
 2023         if (t4_unregister_uld(&tom_uld_info) == EBUSY)
 2024                 return (EBUSY);
 2025 
 2026         t4_tls_mod_unload();
 2027         t4_ddp_mod_unload();
 2028 
 2029         t4_uninit_connect_cpl_handlers();
 2030         t4_uninit_listen_cpl_handlers();
 2031         t4_uninit_cpl_io_handlers();
 2032         t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM);
 2033         t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL);
 2034 
 2035         return (0);
 2036 }
 2037 #endif  /* TCP_OFFLOAD */
 2038 
 2039 static int
 2040 t4_tom_modevent(module_t mod, int cmd, void *arg)
 2041 {
 2042         int rc = 0;
 2043 
 2044 #ifdef TCP_OFFLOAD
 2045         switch (cmd) {
 2046         case MOD_LOAD:
 2047                 rc = t4_tom_mod_load();
 2048                 break;
 2049 
 2050         case MOD_UNLOAD:
 2051                 rc = t4_tom_mod_unload();
 2052                 break;
 2053 
 2054         default:
 2055                 rc = EINVAL;
 2056         }
 2057 #else
 2058         printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
 2059         rc = EOPNOTSUPP;
 2060 #endif
 2061         return (rc);
 2062 }
 2063 
 2064 static moduledata_t t4_tom_moddata = {
 2065         "t4_tom",
 2066         t4_tom_modevent,
 2067         0
 2068 };
 2069 
 2070 MODULE_VERSION(t4_tom, 1);
 2071 MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
 2072 MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
 2073 DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
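
For readers unfamiliar with the declarations above, this is the standard FreeBSD kernel-module boilerplate: a moduledata_t names the module and its event handler, DECLARE_MODULE() registers it at a SYSINIT subsystem/order, and MODULE_DEPEND()/MODULE_VERSION() record the dependency graph. A minimal standalone sketch (hypothetical "example" module):

    #include <sys/param.h>
    #include <sys/kernel.h>
    #include <sys/module.h>

    static int
    example_modevent(module_t mod, int cmd, void *arg)
    {
            switch (cmd) {
            case MOD_LOAD:          /* set up resources */
                    return (0);
            case MOD_UNLOAD:        /* tear down; nonzero blocks unload */
                    return (0);
            default:
                    return (EINVAL);
            }
    }

    static moduledata_t example_mod = {
            "example",              /* module name */
            example_modevent,       /* event handler */
            NULL                    /* extra data */
    };

    DECLARE_MODULE(example, example_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
    MODULE_VERSION(example, 1);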
