FreeBSD/Linux Kernel Cross Reference
sys/dev/cxgbe/tom/t4_cpl_io.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2012, 2015 Chelsio Communications, Inc.
    5  * All rights reserved.
    6  * Written by: Navdeep Parhar <np@FreeBSD.org>
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  */
   29 
   30 #include <sys/cdefs.h>
   31 __FBSDID("$FreeBSD$");
   32 
   33 #include "opt_inet.h"
   34 #include "opt_inet6.h"
   35 #include "opt_kern_tls.h"
   36 #include "opt_ratelimit.h"
   37 
   38 #ifdef TCP_OFFLOAD
   39 #include <sys/param.h>
   40 #include <sys/aio.h>
   41 #include <sys/file.h>
   42 #include <sys/kernel.h>
   43 #include <sys/ktr.h>
   44 #include <sys/module.h>
   45 #include <sys/proc.h>
   46 #include <sys/protosw.h>
   47 #include <sys/domain.h>
   48 #include <sys/socket.h>
   49 #include <sys/socketvar.h>
   50 #include <sys/sglist.h>
   51 #include <sys/taskqueue.h>
   52 #include <netinet/in.h>
   53 #include <netinet/in_pcb.h>
   54 #include <netinet/ip.h>
   55 #include <netinet/ip6.h>
   56 #define TCPSTATES
   57 #include <netinet/tcp_fsm.h>
   58 #include <netinet/tcp_seq.h>
   59 #include <netinet/tcp_var.h>
   60 #include <netinet/toecore.h>
   61 
   62 #include <security/mac/mac_framework.h>
   63 
   64 #include <vm/vm.h>
   65 #include <vm/vm_extern.h>
   66 #include <vm/pmap.h>
   67 #include <vm/vm_map.h>
   68 #include <vm/vm_page.h>
   69 
   70 #include <dev/iscsi/iscsi_proto.h>
   71 
   72 #include "common/common.h"
   73 #include "common/t4_msg.h"
   74 #include "common/t4_regs.h"
   75 #include "common/t4_tcb.h"
   76 #include "tom/t4_tom_l2t.h"
   77 #include "tom/t4_tom.h"
   78 
   79 static void     t4_aiotx_cancel(struct kaiocb *job);
   80 static void     t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep);
   81 
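/*
 * Send a FW_FLOWC_WR for the tid.  This must be the first work request on the
 * connection (other paths assert TPF_FLOWC_WR_SENT before transmitting).  It
 * programs the flow's parameters (PF/VF, channel, port, ingress queue, send
 * buffer, MSS, and optionally the initial sequence numbers and a tx scheduling
 * class) and consumes tx credits and one tx descriptor.
 */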
   82 void
   83 send_flowc_wr(struct toepcb *toep, struct tcpcb *tp)
   84 {
   85         struct wrqe *wr;
   86         struct fw_flowc_wr *flowc;
   87         unsigned int nparams, flowclen, paramidx;
   88         struct vi_info *vi = toep->vi;
   89         struct port_info *pi = vi->pi;
   90         struct adapter *sc = pi->adapter;
   91         unsigned int pfvf = sc->pf << S_FW_VIID_PFN;
   92         struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
   93 
   94         KASSERT(!(toep->flags & TPF_FLOWC_WR_SENT),
   95             ("%s: flowc for tid %u sent already", __func__, toep->tid));
   96 
   97         if (tp != NULL)
   98                 nparams = 8;
   99         else
  100                 nparams = 6;
  101         if (toep->params.tc_idx != -1) {
  102                 MPASS(toep->params.tc_idx >= 0 &&
  103                     toep->params.tc_idx < sc->params.nsched_cls);
  104                 nparams++;
  105         }
  106 
  107         flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
  108 
  109         wr = alloc_wrqe(roundup2(flowclen, 16), &toep->ofld_txq->wrq);
  110         if (wr == NULL) {
  111                 /* XXX */
  112                 panic("%s: allocation failure.", __func__);
  113         }
  114         flowc = wrtod(wr);
  115         memset(flowc, 0, wr->wr_len);
  116 
  117         flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
  118             V_FW_FLOWC_WR_NPARAMS(nparams));
  119         flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
  120             V_FW_WR_FLOWID(toep->tid));
  121 
  122 #define FLOWC_PARAM(__m, __v) \
  123         do { \
  124                 flowc->mnemval[paramidx].mnemonic = FW_FLOWC_MNEM_##__m; \
  125                 flowc->mnemval[paramidx].val = htobe32(__v); \
  126                 paramidx++; \
  127         } while (0)
  128 
  129         paramidx = 0;
  130 
  131         FLOWC_PARAM(PFNVFN, pfvf);
  132         FLOWC_PARAM(CH, pi->tx_chan);
  133         FLOWC_PARAM(PORT, pi->tx_chan);
  134         FLOWC_PARAM(IQID, toep->ofld_rxq->iq.abs_id);
  135         FLOWC_PARAM(SNDBUF, toep->params.sndbuf);
  136         if (tp) {
  137                 FLOWC_PARAM(MSS, toep->params.emss);
  138                 FLOWC_PARAM(SNDNXT, tp->snd_nxt);
  139                 FLOWC_PARAM(RCVNXT, tp->rcv_nxt);
  140         } else
  141                 FLOWC_PARAM(MSS, 512);
  142         CTR6(KTR_CXGBE,
  143             "%s: tid %u, mss %u, sndbuf %u, snd_nxt 0x%x, rcv_nxt 0x%x",
  144             __func__, toep->tid, toep->params.emss, toep->params.sndbuf,
  145             tp ? tp->snd_nxt : 0, tp ? tp->rcv_nxt : 0);
  146 
  147         if (toep->params.tc_idx != -1)
  148                 FLOWC_PARAM(SCHEDCLASS, toep->params.tc_idx);
  149 #undef FLOWC_PARAM
  150 
  151         KASSERT(paramidx == nparams, ("nparams mismatch"));
  152 
  153         txsd->tx_credits = howmany(flowclen, 16);
  154         txsd->plen = 0;
  155         KASSERT(toep->tx_credits >= txsd->tx_credits && toep->txsd_avail > 0,
  156             ("%s: not enough credits (%d)", __func__, toep->tx_credits));
  157         toep->tx_credits -= txsd->tx_credits;
  158         if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
  159                 toep->txsd_pidx = 0;
  160         toep->txsd_avail--;
  161 
  162         toep->flags |= TPF_FLOWC_WR_SENT;
  163         t4_wrq_tx(sc, wr);
  164 }
  165 
  166 #ifdef RATELIMIT
  167 /*
  168  * Input is Bytes/second (so_max_pacing_rate), chip counts in Kilobits/second.
  169  */
  170 static int
  171 update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps)
  172 {
  173         int tc_idx, rc;
  174         const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000;
  175         const int port_id = toep->vi->pi->port_id;
  176 
  177         CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps);
  178 
  179         if (kbps == 0) {
  180                 /* unbind */
  181                 tc_idx = -1;
  182         } else {
  183                 rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx);
  184                 if (rc != 0)
  185                         return (rc);
  186                 MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);
  187         }
  188 
  189         if (toep->params.tc_idx != tc_idx) {
  190                 struct wrqe *wr;
  191                 struct fw_flowc_wr *flowc;
  192                 int nparams = 1, flowclen, flowclen16;
  193                 struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
  194 
  195                 flowclen = sizeof(*flowc) + nparams * sizeof(struct
  196                     fw_flowc_mnemval);
  197                 flowclen16 = howmany(flowclen, 16);
  198                 if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 ||
  199                     (wr = alloc_wrqe(roundup2(flowclen, 16),
  200                     &toep->ofld_txq->wrq)) == NULL) {
  201                         if (tc_idx >= 0)
  202                                 t4_release_cl_rl(sc, port_id, tc_idx);
  203                         return (ENOMEM);
  204                 }
  205 
  206                 flowc = wrtod(wr);
  207                 memset(flowc, 0, wr->wr_len);
  208 
  209                 flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
  210                     V_FW_FLOWC_WR_NPARAMS(nparams));
  211                 flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
  212                     V_FW_WR_FLOWID(toep->tid));
  213 
  214                 flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS;
  215                 if (tc_idx == -1)
  216                         flowc->mnemval[0].val = htobe32(0xff);
  217                 else
  218                         flowc->mnemval[0].val = htobe32(tc_idx);
  219 
  220                 txsd->tx_credits = flowclen16;
  221                 txsd->plen = 0;
  222                 toep->tx_credits -= txsd->tx_credits;
  223                 if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
  224                         toep->txsd_pidx = 0;
  225                 toep->txsd_avail--;
  226                 t4_wrq_tx(sc, wr);
  227         }
  228 
  229         if (toep->params.tc_idx >= 0)
  230                 t4_release_cl_rl(sc, port_id, toep->params.tc_idx);
  231         toep->params.tc_idx = tc_idx;
  232 
  233         return (0);
  234 }
  235 #endif
  236 
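/*
 * Abort the connection by sending a CPL_ABORT_REQ (CPL_ABORT_SEND_RST) so that
 * the hardware transmits an RST to the peer.  TPF_ABORT_SHUTDOWN is set so
 * that only one abort is issued per connection.
 */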
  237 void
  238 send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt)
  239 {
  240         struct wrqe *wr;
  241         struct cpl_abort_req *req;
  242         int tid = toep->tid;
  243         struct inpcb *inp = toep->inp;
  244         struct tcpcb *tp = intotcpcb(inp);      /* don't use if INP_DROPPED */
  245 
  246         INP_WLOCK_ASSERT(inp);
  247 
  248         CTR6(KTR_CXGBE, "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x%s",
  249             __func__, toep->tid,
  250             inp->inp_flags & INP_DROPPED ? "inp dropped" :
  251             tcpstates[tp->t_state],
  252             toep->flags, inp->inp_flags,
  253             toep->flags & TPF_ABORT_SHUTDOWN ?
  254             " (abort already in progress)" : "");
  255 
  256         if (toep->flags & TPF_ABORT_SHUTDOWN)
  257                 return; /* abort already in progress */
  258 
  259         toep->flags |= TPF_ABORT_SHUTDOWN;
  260 
  261         KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
  262             ("%s: flowc_wr not sent for tid %d.", __func__, tid));
  263 
  264         wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
  265         if (wr == NULL) {
  266                 /* XXX */
  267                 panic("%s: allocation failure.", __func__);
  268         }
  269         req = wrtod(wr);
  270 
  271         INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, tid);
  272         if (inp->inp_flags & INP_DROPPED)
  273                 req->rsvd0 = htobe32(snd_nxt);
  274         else
  275                 req->rsvd0 = htobe32(tp->snd_nxt);
  276         req->rsvd1 = !(toep->flags & TPF_TX_DATA_SENT);
  277         req->cmd = CPL_ABORT_SEND_RST;
  278 
  279         /*
  280          * XXX: What's the correct way to tell that the inp hasn't been detached
  281          * from its socket?  Should I even be flushing the snd buffer here?
  282          */
  283         if ((inp->inp_flags & INP_DROPPED) == 0) {
  284                 struct socket *so = inp->inp_socket;
  285 
  286                 if (so != NULL) /* because I'm not sure.  See comment above */
  287                         sbflush(&so->so_snd);
  288         }
  289 
  290         t4_l2t_send(sc, wr, toep->l2te);
  291 }
  292 
  293 /*
  294  * Called when a connection is established to translate the TCP options
  295  * reported by HW to FreeBSD's native format.
  296  */
  297 static void
  298 assign_rxopt(struct tcpcb *tp, uint16_t opt)
  299 {
  300         struct toepcb *toep = tp->t_toe;
  301         struct inpcb *inp = tptoinpcb(tp);
  302         struct adapter *sc = td_adapter(toep->td);
  303 
  304         INP_LOCK_ASSERT(inp);
  305 
  306         toep->params.mtu_idx = G_TCPOPT_MSS(opt);
  307         tp->t_maxseg = sc->params.mtus[toep->params.mtu_idx];
  308         if (inp->inp_inc.inc_flags & INC_ISIPV6)
  309                 tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
  310         else
  311                 tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
  312 
  313         toep->params.emss = tp->t_maxseg;
  314         if (G_TCPOPT_TSTAMP(opt)) {
  315                 toep->params.tstamp = 1;
  316                 toep->params.emss -= TCPOLEN_TSTAMP_APPA;
  317                 tp->t_flags |= TF_RCVD_TSTMP;   /* timestamps ok */
  318                 tp->ts_recent = 0;              /* hmmm */
  319                 tp->ts_recent_age = tcp_ts_getticks();
  320         } else
  321                 toep->params.tstamp = 0;
  322 
  323         if (G_TCPOPT_SACK(opt)) {
  324                 toep->params.sack = 1;
  325                 tp->t_flags |= TF_SACK_PERMIT;  /* should already be set */
  326         } else {
  327                 toep->params.sack = 0;
  328                 tp->t_flags &= ~TF_SACK_PERMIT; /* sack disallowed by peer */
  329         }
  330 
  331         if (G_TCPOPT_WSCALE_OK(opt))
  332                 tp->t_flags |= TF_RCVD_SCALE;
  333 
  334         /* Doing window scaling? */
  335         if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
  336             (TF_RCVD_SCALE | TF_REQ_SCALE)) {
  337                 tp->rcv_scale = tp->request_r_scale;
  338                 tp->snd_scale = G_TCPOPT_SND_WSCALE(opt);
  339         } else
  340                 toep->params.wscale = 0;
  341 
  342         CTR6(KTR_CXGBE,
  343             "assign_rxopt: tid %d, mtu_idx %u, emss %u, ts %u, sack %u, wscale %u",
  344             toep->tid, toep->params.mtu_idx, toep->params.emss,
  345             toep->params.tstamp, toep->params.sack, toep->params.wscale);
  346 }
  347 
  348 /*
  349  * Completes some final bits of initialization for just established connections
  350  * and changes their state to TCPS_ESTABLISHED.
  351  *
  352  * The ISNs are from the exchange of SYNs.
  353  */
  354 void
  355 make_established(struct toepcb *toep, uint32_t iss, uint32_t irs, uint16_t opt)
  356 {
  357         struct inpcb *inp = toep->inp;
  358         struct socket *so = inp->inp_socket;
  359         struct tcpcb *tp = intotcpcb(inp);
  360         uint16_t tcpopt = be16toh(opt);
  361 
  362         INP_WLOCK_ASSERT(inp);
  363         KASSERT(tp->t_state == TCPS_SYN_SENT ||
  364             tp->t_state == TCPS_SYN_RECEIVED,
  365             ("%s: TCP state %s", __func__, tcpstates[tp->t_state]));
  366 
  367         CTR6(KTR_CXGBE, "%s: tid %d, so %p, inp %p, tp %p, toep %p",
  368             __func__, toep->tid, so, inp, tp, toep);
  369 
  370         tcp_state_change(tp, TCPS_ESTABLISHED);
  371         tp->t_starttime = ticks;
  372         TCPSTAT_INC(tcps_connects);
  373 
  374         tp->irs = irs;
  375         tcp_rcvseqinit(tp);
  376         tp->rcv_wnd = (u_int)toep->params.opt0_bufsize << 10;
  377         tp->rcv_adv += tp->rcv_wnd;
  378         tp->last_ack_sent = tp->rcv_nxt;
  379 
  380         tp->iss = iss;
  381         tcp_sendseqinit(tp);
  382         tp->snd_una = iss + 1;
  383         tp->snd_nxt = iss + 1;
  384         tp->snd_max = iss + 1;
  385 
  386         assign_rxopt(tp, tcpopt);
  387         send_flowc_wr(toep, tp);
  388 
  389         soisconnected(so);
  390 }
  391 
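/*
 * Return receive window credits to the hardware with a CPL_RX_DATA_ACK on the
 * control queue.  Returns the number of credits granted, or 0 if the work
 * request could not be allocated.
 */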
  392 int
  393 send_rx_credits(struct adapter *sc, struct toepcb *toep, int credits)
  394 {
  395         struct wrqe *wr;
  396         struct cpl_rx_data_ack *req;
  397         uint32_t dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
  398 
  399         KASSERT(credits >= 0, ("%s: %d credits", __func__, credits));
  400 
  401         wr = alloc_wrqe(sizeof(*req), toep->ctrlq);
  402         if (wr == NULL)
  403                 return (0);
  404         req = wrtod(wr);
  405 
  406         INIT_TP_WR_MIT_CPL(req, CPL_RX_DATA_ACK, toep->tid);
  407         req->credit_dack = htobe32(dack | V_RX_CREDITS(credits));
  408 
  409         t4_wrq_tx(sc, wr);
  410         return (credits);
  411 }
  412 
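/*
 * Called after the application has read data from the socket buffer: if enough
 * receive buffer space has been freed, send rx credits to reopen the receive
 * window advertised to the hardware.
 */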
  413 void
  414 t4_rcvd_locked(struct toedev *tod, struct tcpcb *tp)
  415 {
  416         struct adapter *sc = tod->tod_softc;
  417         struct inpcb *inp = tptoinpcb(tp);
  418         struct socket *so = inp->inp_socket;
  419         struct sockbuf *sb = &so->so_rcv;
  420         struct toepcb *toep = tp->t_toe;
  421         int rx_credits;
  422 
  423         INP_WLOCK_ASSERT(inp);
  424         SOCKBUF_LOCK_ASSERT(sb);
  425 
  426         rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
  427         if (rx_credits > 0 &&
  428             (tp->rcv_wnd <= 32 * 1024 || rx_credits >= 64 * 1024 ||
  429             (rx_credits >= 16 * 1024 && tp->rcv_wnd <= 128 * 1024) ||
  430             sbused(sb) + tp->rcv_wnd < sb->sb_lowat)) {
  431                 rx_credits = send_rx_credits(sc, toep, rx_credits);
  432                 tp->rcv_wnd += rx_credits;
  433                 tp->rcv_adv += rx_credits;
  434         }
  435 }
  436 
  437 void
  438 t4_rcvd(struct toedev *tod, struct tcpcb *tp)
  439 {
  440         struct inpcb *inp = tptoinpcb(tp);
  441         struct socket *so = inp->inp_socket;
  442         struct sockbuf *sb = &so->so_rcv;
  443 
  444         SOCKBUF_LOCK(sb);
  445         t4_rcvd_locked(tod, tp);
  446         SOCKBUF_UNLOCK(sb);
  447 }
  448 
  449 /*
  450  * Close a connection by sending a CPL_CLOSE_CON_REQ message.
  451  */
  452 int
  453 t4_close_conn(struct adapter *sc, struct toepcb *toep)
  454 {
  455         struct wrqe *wr;
  456         struct cpl_close_con_req *req;
  457         unsigned int tid = toep->tid;
  458 
  459         CTR3(KTR_CXGBE, "%s: tid %u%s", __func__, toep->tid,
  460             toep->flags & TPF_FIN_SENT ? ", IGNORED" : "");
  461 
  462         if (toep->flags & TPF_FIN_SENT)
  463                 return (0);
  464 
  465         KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
  466             ("%s: flowc_wr not sent for tid %u.", __func__, tid));
  467 
  468         wr = alloc_wrqe(sizeof(*req), &toep->ofld_txq->wrq);
  469         if (wr == NULL) {
  470                 /* XXX */
  471                 panic("%s: allocation failure.", __func__);
  472         }
  473         req = wrtod(wr);
  474 
  475         req->wr.wr_hi = htonl(V_FW_WR_OP(FW_TP_WR) |
  476             V_FW_WR_IMMDLEN(sizeof(*req) - sizeof(req->wr)));
  477         req->wr.wr_mid = htonl(V_FW_WR_LEN16(howmany(sizeof(*req), 16)) |
  478             V_FW_WR_FLOWID(tid));
  479         req->wr.wr_lo = cpu_to_be64(0);
  480         OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
  481         req->rsvd = 0;
  482 
  483         toep->flags |= TPF_FIN_SENT;
  484         toep->flags &= ~TPF_SEND_FIN;
  485         t4_l2t_send(sc, wr, toep->l2te);
  486 
  487         return (0);
  488 }
  489 
  490 #define MAX_OFLD_TX_CREDITS (SGE_MAX_WR_LEN / 16)
  491 #define MIN_OFLD_TX_CREDITS (howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16))
  492 #define MIN_ISO_TX_CREDITS  (howmany(sizeof(struct cpl_tx_data_iso), 16))
  493 #define MIN_TX_CREDITS(iso)                                             \
  494         (MIN_OFLD_TX_CREDITS + ((iso) ? MIN_ISO_TX_CREDITS : 0))
  495 
  496 /* Maximum amount of immediate data we could stuff in a WR */
  497 static inline int
  498 max_imm_payload(int tx_credits, int iso)
  499 {
  500         const int iso_cpl_size = iso ? sizeof(struct cpl_tx_data_iso) : 0;
  501         const int n = 1;        /* Use no more than one desc for imm. data WR */
  502 
  503         KASSERT(tx_credits >= 0 &&
  504                 tx_credits <= MAX_OFLD_TX_CREDITS,
  505                 ("%s: %d credits", __func__, tx_credits));
  506 
  507         if (tx_credits < MIN_TX_CREDITS(iso))
  508                 return (0);
  509 
  510         if (tx_credits >= (n * EQ_ESIZE) / 16)
  511                 return ((n * EQ_ESIZE) - sizeof(struct fw_ofld_tx_data_wr) -
  512                     iso_cpl_size);
  513         else
  514                 return (tx_credits * 16 - sizeof(struct fw_ofld_tx_data_wr) -
  515                     iso_cpl_size);
  516 }
  517 
  518 /* Maximum number of SGL entries we could stuff in a WR */
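/*
 * Each tx credit is 16 bytes.  A ulptx_sge_pair is 24 bytes and describes two
 * segments; 16 leftover bytes are enough for one more length/address pair,
 * hence the check against a remainder of 16 below.
 */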
  519 static inline int
  520 max_dsgl_nsegs(int tx_credits, int iso)
  521 {
  522         int nseg = 1;   /* ulptx_sgl has room for 1, rest ulp_tx_sge_pair */
  523         int sge_pair_credits = tx_credits - MIN_TX_CREDITS(iso);
  524 
  525         KASSERT(tx_credits >= 0 &&
  526                 tx_credits <= MAX_OFLD_TX_CREDITS,
  527                 ("%s: %d credits", __func__, tx_credits));
  528 
  529         if (tx_credits < MIN_TX_CREDITS(iso))
  530                 return (0);
  531 
  532         nseg += 2 * (sge_pair_credits * 16 / 24);
  533         if ((sge_pair_credits * 16) % 24 == 16)
  534                 nseg++;
  535 
  536         return (nseg);
  537 }
  538 
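/*
 * Fill in the header of an offload tx data work request at 'dst': opcode,
 * immediate data length, flowid and length in 16B units, ULP mode/submode,
 * shove, and the payload length.  The LSO-disable or payload-alignment flags
 * are set based on the connection's tx_align setting.
 */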
  539 static inline void
  540 write_tx_wr(void *dst, struct toepcb *toep, int fw_wr_opcode,
  541     unsigned int immdlen, unsigned int plen, uint8_t credits, int shove,
  542     int ulp_submode)
  543 {
  544         struct fw_ofld_tx_data_wr *txwr = dst;
  545 
  546         txwr->op_to_immdlen = htobe32(V_WR_OP(fw_wr_opcode) |
  547             V_FW_WR_IMMDLEN(immdlen));
  548         txwr->flowid_len16 = htobe32(V_FW_WR_FLOWID(toep->tid) |
  549             V_FW_WR_LEN16(credits));
  550         txwr->lsodisable_to_flags = htobe32(V_TX_ULP_MODE(ulp_mode(toep)) |
  551             V_TX_ULP_SUBMODE(ulp_submode) | V_TX_URG(0) | V_TX_SHOVE(shove));
  552         txwr->plen = htobe32(plen);
  553 
  554         if (toep->params.tx_align > 0) {
  555                 if (plen < 2 * toep->params.emss)
  556                         txwr->lsodisable_to_flags |=
  557                             htobe32(F_FW_OFLD_TX_DATA_WR_LSODISABLE);
  558                 else
  559                         txwr->lsodisable_to_flags |=
  560                             htobe32(F_FW_OFLD_TX_DATA_WR_ALIGNPLD |
  561                                 (toep->params.nagle == 0 ? 0 :
  562                                 F_FW_OFLD_TX_DATA_WR_ALIGNPLDSHOVE));
  563         }
  564 }
  565 
  566 /*
  567  * Generate a DSGL from a starting mbuf.  The total number of segments and the
  568  * maximum segments in any one mbuf are provided.
  569  */
  570 static void
  571 write_tx_sgl(void *dst, struct mbuf *start, struct mbuf *stop, int nsegs, int n)
  572 {
  573         struct mbuf *m;
  574         struct ulptx_sgl *usgl = dst;
  575         int i, j, rc;
  576         struct sglist sg;
  577         struct sglist_seg segs[n];
  578 
  579         KASSERT(nsegs > 0, ("%s: nsegs 0", __func__));
  580 
  581         sglist_init(&sg, n, segs);
  582         usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
  583             V_ULPTX_NSGE(nsegs));
  584 
  585         i = -1;
  586         for (m = start; m != stop; m = m->m_next) {
  587                 if (m->m_flags & M_EXTPG)
  588                         rc = sglist_append_mbuf_epg(&sg, m,
  589                             mtod(m, vm_offset_t), m->m_len);
  590                 else
  591                         rc = sglist_append(&sg, mtod(m, void *), m->m_len);
  592                 if (__predict_false(rc != 0))
  593                         panic("%s: sglist_append %d", __func__, rc);
  594 
  595                 for (j = 0; j < sg.sg_nseg; i++, j++) {
  596                         if (i < 0) {
  597                                 usgl->len0 = htobe32(segs[j].ss_len);
  598                                 usgl->addr0 = htobe64(segs[j].ss_paddr);
  599                         } else {
  600                                 usgl->sge[i / 2].len[i & 1] =
  601                                     htobe32(segs[j].ss_len);
  602                                 usgl->sge[i / 2].addr[i & 1] =
  603                                     htobe64(segs[j].ss_paddr);
  604                         }
  605 #ifdef INVARIANTS
  606                         nsegs--;
  607 #endif
  608                 }
  609                 sglist_reset(&sg);
  610         }
  611         if (i & 1)
  612                 usgl->sge[i / 2].len[1] = htobe32(0);
  613         KASSERT(nsegs == 0, ("%s: nsegs %d, start %p, stop %p",
  614             __func__, nsegs, start, stop));
  615 }
  616 
  617 /*
  618  * Max number of SGL entries an offload tx work request can have.  This is 41
  619  * (1 + 40) for a full 512B work request.
  620  * fw_ofld_tx_data_wr(16B) + ulptx_sgl(16B, 1) + ulptx_sge_pair(480B, 40)
  621  */
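/* 480B of sge_pair space is 20 pairs of 24 bytes, 2 segments each: 1 + 40 = 41. */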
  622 #define OFLD_SGL_LEN (41)
  623 
  624 /*
  625  * Send data and/or a FIN to the peer.
  626  *
  627  * The socket's so_snd buffer consists of a stream of data starting with sb_mb
  628  * and linked together with m_next.  sb_sndptr, if set, is the last mbuf that
  629  * was transmitted.
  630  *
  631  * drop indicates the number of bytes that should be dropped from the head of
  632  * the send buffer.  It is an optimization that lets do_fw4_ack avoid creating
  633  * contention on the send buffer lock (before this change it used to do
  634  * sowwakeup and then t4_push_frames right after that when recovering from tx
  635  * stalls).  When drop is set this function MUST drop the bytes and wake up any
  636  * writers.
  637  */
  638 void
  639 t4_push_frames(struct adapter *sc, struct toepcb *toep, int drop)
  640 {
  641         struct mbuf *sndptr, *m, *sb_sndptr;
  642         struct fw_ofld_tx_data_wr *txwr;
  643         struct wrqe *wr;
  644         u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
  645         struct inpcb *inp = toep->inp;
  646         struct tcpcb *tp = intotcpcb(inp);
  647         struct socket *so = inp->inp_socket;
  648         struct sockbuf *sb = &so->so_snd;
  649         int tx_credits, shove, compl, sowwakeup;
  650         struct ofld_tx_sdesc *txsd;
  651         bool nomap_mbuf_seen;
  652 
  653         INP_WLOCK_ASSERT(inp);
  654         KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
  655             ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
  656 
  657         KASSERT(ulp_mode(toep) == ULP_MODE_NONE ||
  658             ulp_mode(toep) == ULP_MODE_TCPDDP ||
  659             ulp_mode(toep) == ULP_MODE_TLS ||
  660             ulp_mode(toep) == ULP_MODE_RDMA,
  661             ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
  662 
  663 #ifdef VERBOSE_TRACES
  664         CTR5(KTR_CXGBE, "%s: tid %d toep flags %#x tp flags %#x drop %d",
  665             __func__, toep->tid, toep->flags, tp->t_flags, drop);
  666 #endif
  667         if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
  668                 return;
  669 
  670 #ifdef RATELIMIT
  671         if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) &&
  672             (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) {
  673                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
  674         }
  675 #endif
  676 
  677         /*
  678          * This function doesn't resume by itself.  Someone else must clear the
  679          * flag and call this function.
  680          */
  681         if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
  682                 KASSERT(drop == 0,
  683                     ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
  684                 return;
  685         }
  686 
  687         txsd = &toep->txsd[toep->txsd_pidx];
  688         do {
  689                 tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
  690                 max_imm = max_imm_payload(tx_credits, 0);
  691                 max_nsegs = max_dsgl_nsegs(tx_credits, 0);
  692 
  693                 SOCKBUF_LOCK(sb);
  694                 sowwakeup = drop;
  695                 if (drop) {
  696                         sbdrop_locked(sb, drop);
  697                         drop = 0;
  698                 }
  699                 sb_sndptr = sb->sb_sndptr;
  700                 sndptr = sb_sndptr ? sb_sndptr->m_next : sb->sb_mb;
  701                 plen = 0;
  702                 nsegs = 0;
  703                 max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
  704                 nomap_mbuf_seen = false;
  705                 for (m = sndptr; m != NULL; m = m->m_next) {
  706                         int n;
  707 
  708                         if ((m->m_flags & M_NOTAVAIL) != 0)
  709                                 break;
  710                         if (m->m_flags & M_EXTPG) {
  711 #ifdef KERN_TLS
  712                                 if (m->m_epg_tls != NULL) {
  713                                         toep->flags |= TPF_KTLS;
  714                                         if (plen == 0) {
  715                                                 SOCKBUF_UNLOCK(sb);
  716                                                 t4_push_ktls(sc, toep, 0);
  717                                                 return;
  718                                         }
  719                                         break;
  720                                 }
  721 #endif
  722                                 n = sglist_count_mbuf_epg(m,
  723                                     mtod(m, vm_offset_t), m->m_len);
  724                         } else
  725                                 n = sglist_count(mtod(m, void *), m->m_len);
  726 
  727                         nsegs += n;
  728                         plen += m->m_len;
  729 
  730                         /* This mbuf sent us _over_ the nsegs limit, back out */
  731                         if (plen > max_imm && nsegs > max_nsegs) {
  732                                 nsegs -= n;
  733                                 plen -= m->m_len;
  734                                 if (plen == 0) {
  735                                         /* Too few credits */
  736                                         toep->flags |= TPF_TX_SUSPENDED;
  737                                         if (sowwakeup) {
  738                                                 if (!TAILQ_EMPTY(
  739                                                     &toep->aiotx_jobq))
  740                                                         t4_aiotx_queue_toep(so,
  741                                                             toep);
  742                                                 sowwakeup_locked(so);
  743                                         } else
  744                                                 SOCKBUF_UNLOCK(sb);
  745                                         SOCKBUF_UNLOCK_ASSERT(sb);
  746                                         return;
  747                                 }
  748                                 break;
  749                         }
  750 
  751                         if (m->m_flags & M_EXTPG)
  752                                 nomap_mbuf_seen = true;
  753                         if (max_nsegs_1mbuf < n)
  754                                 max_nsegs_1mbuf = n;
  755                         sb_sndptr = m;  /* new sb->sb_sndptr if all goes well */
  756 
  757                         /* This mbuf put us right at the max_nsegs limit */
  758                         if (plen > max_imm && nsegs == max_nsegs) {
  759                                 m = m->m_next;
  760                                 break;
  761                         }
  762                 }
  763 
  764                 if (sbused(sb) > sb->sb_hiwat * 5 / 8 &&
  765                     toep->plen_nocompl + plen >= sb->sb_hiwat / 4)
  766                         compl = 1;
  767                 else
  768                         compl = 0;
  769 
  770                 if (sb->sb_flags & SB_AUTOSIZE &&
  771                     V_tcp_do_autosndbuf &&
  772                     sb->sb_hiwat < V_tcp_autosndbuf_max &&
  773                     sbused(sb) >= sb->sb_hiwat * 7 / 8) {
  774                         int newsize = min(sb->sb_hiwat + V_tcp_autosndbuf_inc,
  775                             V_tcp_autosndbuf_max);
  776 
  777                         if (!sbreserve_locked(so, SO_SND, newsize, NULL))
  778                                 sb->sb_flags &= ~SB_AUTOSIZE;
  779                         else
  780                                 sowwakeup = 1;  /* room available */
  781                 }
  782                 if (sowwakeup) {
  783                         if (!TAILQ_EMPTY(&toep->aiotx_jobq))
  784                                 t4_aiotx_queue_toep(so, toep);
  785                         sowwakeup_locked(so);
  786                 } else
  787                         SOCKBUF_UNLOCK(sb);
  788                 SOCKBUF_UNLOCK_ASSERT(sb);
  789 
  790                 /* nothing to send */
  791                 if (plen == 0) {
  792                         KASSERT(m == NULL || (m->m_flags & M_NOTAVAIL) != 0,
  793                             ("%s: nothing to send, but m != NULL is ready",
  794                             __func__));
  795                         break;
  796                 }
  797 
  798                 if (__predict_false(toep->flags & TPF_FIN_SENT))
  799                         panic("%s: excess tx.", __func__);
  800 
  801                 shove = m == NULL && !(tp->t_flags & TF_MORETOCOME);
  802                 if (plen <= max_imm && !nomap_mbuf_seen) {
  803 
  804                         /* Immediate data tx */
  805 
  806                         wr = alloc_wrqe(roundup2(sizeof(*txwr) + plen, 16),
  807                                         &toep->ofld_txq->wrq);
  808                         if (wr == NULL) {
  809                                 /* XXX: how will we recover from this? */
  810                                 toep->flags |= TPF_TX_SUSPENDED;
  811                                 return;
  812                         }
  813                         txwr = wrtod(wr);
  814                         credits = howmany(wr->wr_len, 16);
  815                         write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, plen, plen,
  816                             credits, shove, 0);
  817                         m_copydata(sndptr, 0, plen, (void *)(txwr + 1));
  818                         nsegs = 0;
  819                 } else {
  820                         int wr_len;
  821 
  822                         /* DSGL tx */
  823 
  824                         wr_len = sizeof(*txwr) + sizeof(struct ulptx_sgl) +
  825                             ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
  826                         wr = alloc_wrqe(roundup2(wr_len, 16),
  827                             &toep->ofld_txq->wrq);
  828                         if (wr == NULL) {
  829                                 /* XXX: how will we recover from this? */
  830                                 toep->flags |= TPF_TX_SUSPENDED;
  831                                 return;
  832                         }
  833                         txwr = wrtod(wr);
  834                         credits = howmany(wr_len, 16);
  835                         write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, 0, plen,
  836                             credits, shove, 0);
  837                         write_tx_sgl(txwr + 1, sndptr, m, nsegs,
  838                             max_nsegs_1mbuf);
  839                         if (wr_len & 0xf) {
  840                                 uint64_t *pad = (uint64_t *)
  841                                     ((uintptr_t)txwr + wr_len);
  842                                 *pad = 0;
  843                         }
  844                 }
  845 
  846                 KASSERT(toep->tx_credits >= credits,
  847                         ("%s: not enough credits", __func__));
  848 
  849                 toep->tx_credits -= credits;
  850                 toep->tx_nocompl += credits;
  851                 toep->plen_nocompl += plen;
  852                 if (toep->tx_credits <= toep->tx_total * 3 / 8 &&
  853                     toep->tx_nocompl >= toep->tx_total / 4)
  854                         compl = 1;
  855 
  856                 if (compl || ulp_mode(toep) == ULP_MODE_RDMA) {
  857                         txwr->op_to_immdlen |= htobe32(F_FW_WR_COMPL);
  858                         toep->tx_nocompl = 0;
  859                         toep->plen_nocompl = 0;
  860                 }
  861 
  862                 tp->snd_nxt += plen;
  863                 tp->snd_max += plen;
  864 
  865                 SOCKBUF_LOCK(sb);
  866                 KASSERT(sb_sndptr, ("%s: sb_sndptr is NULL", __func__));
  867                 sb->sb_sndptr = sb_sndptr;
  868                 SOCKBUF_UNLOCK(sb);
  869 
  870                 toep->flags |= TPF_TX_DATA_SENT;
  871                 if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
  872                         toep->flags |= TPF_TX_SUSPENDED;
  873 
  874                 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
  875                 txsd->plen = plen;
  876                 txsd->tx_credits = credits;
  877                 txsd++;
  878                 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
  879                         toep->txsd_pidx = 0;
  880                         txsd = &toep->txsd[0];
  881                 }
  882                 toep->txsd_avail--;
  883 
  884                 t4_l2t_send(sc, wr, toep->l2te);
  885         } while (m != NULL && (m->m_flags & M_NOTAVAIL) == 0);
  886 
  887         /* Send a FIN if requested, but only if there's no more data to send */
  888         if (m == NULL && toep->flags & TPF_SEND_FIN)
  889                 t4_close_conn(sc, toep);
  890 }
  891 
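/*
 * Drop 'plen' bytes worth of completed PDUs from the head of the queue,
 * freeing the mbufs.  Used to reclaim PDUs from ulp_pdu_reclaimq as tx
 * completions arrive.
 */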
  892 static inline void
  893 rqdrop_locked(struct mbufq *q, int plen)
  894 {
  895         struct mbuf *m;
  896 
  897         while (plen > 0) {
  898                 m = mbufq_dequeue(q);
  899 
  900                 /* Too many credits. */
  901                 MPASS(m != NULL);
  902                 M_ASSERTPKTHDR(m);
  903 
  904                 /* Partial credits. */
  905                 MPASS(plen >= m->m_pkthdr.len);
  906 
  907                 plen -= m->m_pkthdr.len;
  908                 m_freem(m);
  909         }
  910 }
  911 
  912 /*
  913  * Not a bit in the TCB, but is a bit in the ulp_submode field of the
  914  * CPL_TX_DATA flags field in FW_ISCSI_TX_DATA_WR.
  915  */
  916 #define ULP_ISO         G_TX_ULP_SUBMODE(F_FW_ISCSI_TX_DATA_WR_ULPSUBMODE_ISO)
  917 
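/*
 * Write a CPL_TX_DATA_ISO at 'dst' describing how the firmware should segment
 * a large iSCSI PDU into PDUs of at most 'mss' payload bytes.
 */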
  918 static void
  919 write_tx_data_iso(void *dst, u_int ulp_submode, uint8_t flags, uint16_t mss,
  920     int len, int npdu)
  921 {
  922         struct cpl_tx_data_iso *cpl;
  923         unsigned int burst_size;
  924         unsigned int last;
  925 
  926         /*
  927          * The firmware will set the 'F' bit on the last PDU when
  928          * either condition is true:
  929          *
  930          * - this large PDU is marked as the "last" slice
  931          *
  932          * - the amount of data payload bytes equals the burst_size
  933          *
  934          * The strategy used here is to always set the burst_size
  935          * artificially high (len includes the size of the template
  936          * BHS) and only set the "last" flag if the original PDU had
  937          * 'F' set.
  938          */
  939         burst_size = len;
  940         last = !!(flags & CXGBE_ISO_F);
  941 
  942         cpl = (struct cpl_tx_data_iso *)dst;
  943         cpl->op_to_scsi = htonl(V_CPL_TX_DATA_ISO_OP(CPL_TX_DATA_ISO) |
  944             V_CPL_TX_DATA_ISO_FIRST(1) | V_CPL_TX_DATA_ISO_LAST(last) |
  945             V_CPL_TX_DATA_ISO_CPLHDRLEN(0) |
  946             V_CPL_TX_DATA_ISO_HDRCRC(!!(ulp_submode & ULP_CRC_HEADER)) |
  947             V_CPL_TX_DATA_ISO_PLDCRC(!!(ulp_submode & ULP_CRC_DATA)) |
  948             V_CPL_TX_DATA_ISO_IMMEDIATE(0) |
  949             V_CPL_TX_DATA_ISO_SCSI(CXGBE_ISO_TYPE(flags)));
  950 
  951         cpl->ahs_len = 0;
  952         cpl->mpdu = htons(DIV_ROUND_UP(mss, 4));
  953         cpl->burst_size = htonl(DIV_ROUND_UP(burst_size, 4));
  954         cpl->len = htonl(len);
  955         cpl->reserved2_seglen_offset = htonl(0);
  956         cpl->datasn_offset = htonl(0);
  957         cpl->buffer_offset = htonl(0);
  958         cpl->reserved3 = 0;
  959 }
  960 
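/*
 * Build the work request for a single iSCSI PDU.  Raw work requests already
 * encoded in the mbuf are copied verbatim; otherwise an offload tx data WR is
 * constructed with either immediate data or a DSGL, plus a CPL_TX_DATA_ISO if
 * the PDU is to be segmented by the firmware.  Returns NULL if there are not
 * enough tx credits (or no WR could be allocated) and tx should be suspended.
 */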
  961 static struct wrqe *
  962 write_iscsi_mbuf_wr(struct toepcb *toep, struct mbuf *sndptr)
  963 {
  964         struct mbuf *m;
  965         struct fw_ofld_tx_data_wr *txwr;
  966         struct cpl_tx_data_iso *cpl_iso;
  967         void *p;
  968         struct wrqe *wr;
  969         u_int plen, nsegs, credits, max_imm, max_nsegs, max_nsegs_1mbuf;
  970         u_int adjusted_plen, imm_data, ulp_submode;
  971         struct inpcb *inp = toep->inp;
  972         struct tcpcb *tp = intotcpcb(inp);
  973         int tx_credits, shove, npdu, wr_len;
  974         uint16_t iso_mss;
  975         static const u_int ulp_extra_len[] = {0, 4, 4, 8};
  976         bool iso, nomap_mbuf_seen;
  977 
  978         M_ASSERTPKTHDR(sndptr);
  979 
  980         tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS);
  981         if (mbuf_raw_wr(sndptr)) {
  982                 plen = sndptr->m_pkthdr.len;
  983                 KASSERT(plen <= SGE_MAX_WR_LEN,
  984                     ("raw WR len %u is greater than max WR len", plen));
  985                 if (plen > tx_credits * 16)
  986                         return (NULL);
  987 
  988                 wr = alloc_wrqe(roundup2(plen, 16), &toep->ofld_txq->wrq);
  989                 if (__predict_false(wr == NULL))
  990                         return (NULL);
  991 
  992                 m_copydata(sndptr, 0, plen, wrtod(wr));
  993                 return (wr);
  994         }
  995 
  996         iso = mbuf_iscsi_iso(sndptr);
  997         max_imm = max_imm_payload(tx_credits, iso);
  998         max_nsegs = max_dsgl_nsegs(tx_credits, iso);
  999         iso_mss = mbuf_iscsi_iso_mss(sndptr);
 1000 
 1001         plen = 0;
 1002         nsegs = 0;
 1003         max_nsegs_1mbuf = 0; /* max # of SGL segments in any one mbuf */
 1004         nomap_mbuf_seen = false;
 1005         for (m = sndptr; m != NULL; m = m->m_next) {
 1006                 int n;
 1007 
 1008                 if (m->m_flags & M_EXTPG)
 1009                         n = sglist_count_mbuf_epg(m, mtod(m, vm_offset_t),
 1010                             m->m_len);
 1011                 else
 1012                         n = sglist_count(mtod(m, void *), m->m_len);
 1013 
 1014                 nsegs += n;
 1015                 plen += m->m_len;
 1016 
 1017                 /*
 1018                  * This mbuf would send us _over_ the nsegs limit.
 1019                  * Suspend tx because the PDU can't be sent out.
 1020                  */
 1021                 if ((nomap_mbuf_seen || plen > max_imm) && nsegs > max_nsegs)
 1022                         return (NULL);
 1023 
 1024                 if (m->m_flags & M_EXTPG)
 1025                         nomap_mbuf_seen = true;
 1026                 if (max_nsegs_1mbuf < n)
 1027                         max_nsegs_1mbuf = n;
 1028         }
 1029 
 1030         if (__predict_false(toep->flags & TPF_FIN_SENT))
 1031                 panic("%s: excess tx.", __func__);
 1032 
 1033         /*
 1034          * We have a PDU to send.  All of it goes out in one WR so 'm'
 1035          * is NULL.  A PDU's length is always a multiple of 4.
 1036          */
 1037         MPASS(m == NULL);
 1038         MPASS((plen & 3) == 0);
 1039         MPASS(sndptr->m_pkthdr.len == plen);
 1040 
 1041         shove = !(tp->t_flags & TF_MORETOCOME);
 1042 
 1043         /*
 1044          * plen doesn't include header and data digests, which are
 1045          * generated and inserted in the right places by the TOE, but
 1046          * they do occupy TCP sequence space and need to be accounted
 1047          * for.
 1048          */
 1049         ulp_submode = mbuf_ulp_submode(sndptr);
 1050         MPASS(ulp_submode < nitems(ulp_extra_len));
 1051         npdu = iso ? howmany(plen - ISCSI_BHS_SIZE, iso_mss) : 1;
 1052         adjusted_plen = plen + ulp_extra_len[ulp_submode] * npdu;
 1053         if (iso)
 1054                 adjusted_plen += ISCSI_BHS_SIZE * (npdu - 1);
 1055         wr_len = sizeof(*txwr);
 1056         if (iso)
 1057                 wr_len += sizeof(struct cpl_tx_data_iso);
 1058         if (plen <= max_imm && !nomap_mbuf_seen) {
 1059                 /* Immediate data tx */
 1060                 imm_data = plen;
 1061                 wr_len += plen;
 1062                 nsegs = 0;
 1063         } else {
 1064                 /* DSGL tx */
 1065                 imm_data = 0;
 1066                 wr_len += sizeof(struct ulptx_sgl) +
 1067                     ((3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1)) * 8;
 1068         }
 1069 
 1070         wr = alloc_wrqe(roundup2(wr_len, 16), &toep->ofld_txq->wrq);
 1071         if (wr == NULL) {
 1072                 /* XXX: how will we recover from this? */
 1073                 return (NULL);
 1074         }
 1075         txwr = wrtod(wr);
 1076         credits = howmany(wr->wr_len, 16);
 1077 
 1078         if (iso) {
 1079                 write_tx_wr(txwr, toep, FW_ISCSI_TX_DATA_WR,
 1080                     imm_data + sizeof(struct cpl_tx_data_iso),
 1081                     adjusted_plen, credits, shove, ulp_submode | ULP_ISO);
 1082                 cpl_iso = (struct cpl_tx_data_iso *)(txwr + 1);
 1083                 MPASS(plen == sndptr->m_pkthdr.len);
 1084                 write_tx_data_iso(cpl_iso, ulp_submode,
 1085                     mbuf_iscsi_iso_flags(sndptr), iso_mss, plen, npdu);
 1086                 p = cpl_iso + 1;
 1087         } else {
 1088                 write_tx_wr(txwr, toep, FW_OFLD_TX_DATA_WR, imm_data,
 1089                     adjusted_plen, credits, shove, ulp_submode);
 1090                 p = txwr + 1;
 1091         }
 1092 
 1093         if (imm_data != 0) {
 1094                 m_copydata(sndptr, 0, plen, p);
 1095         } else {
 1096                 write_tx_sgl(p, sndptr, m, nsegs, max_nsegs_1mbuf);
 1097                 if (wr_len & 0xf) {
 1098                         uint64_t *pad = (uint64_t *)((uintptr_t)txwr + wr_len);
 1099                         *pad = 0;
 1100                 }
 1101         }
 1102 
 1103         KASSERT(toep->tx_credits >= credits,
 1104             ("%s: not enough credits: credits %u "
 1105                 "toep->tx_credits %u tx_credits %u nsegs %u "
 1106                 "max_nsegs %u iso %d", __func__, credits,
 1107                 toep->tx_credits, tx_credits, nsegs, max_nsegs, iso));
 1108 
 1109         tp->snd_nxt += adjusted_plen;
 1110         tp->snd_max += adjusted_plen;
 1111 
 1112         counter_u64_add(toep->ofld_txq->tx_iscsi_pdus, npdu);
 1113         counter_u64_add(toep->ofld_txq->tx_iscsi_octets, plen);
 1114         if (iso)
 1115                 counter_u64_add(toep->ofld_txq->tx_iscsi_iso_wrs, 1);
 1116 
 1117         return (wr);
 1118 }
 1119 
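/*
 * iSCSI counterpart of t4_push_frames.  'drop' bytes of tx completions are
 * charged first to any pre-offload data still in so_snd and then to PDUs on
 * the reclaim queue.  Queued PDUs are then pulled off ulp_pduq, converted to
 * work requests, and handed to the hardware until credits run out.
 */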
 1120 void
 1121 t4_push_pdus(struct adapter *sc, struct toepcb *toep, int drop)
 1122 {
 1123         struct mbuf *sndptr, *m;
 1124         struct fw_wr_hdr *wrhdr;
 1125         struct wrqe *wr;
 1126         u_int plen, credits;
 1127         struct inpcb *inp = toep->inp;
 1128         struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx];
 1129         struct mbufq *pduq = &toep->ulp_pduq;
 1130 
 1131         INP_WLOCK_ASSERT(inp);
 1132         KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 1133             ("%s: flowc_wr not sent for tid %u.", __func__, toep->tid));
 1134         KASSERT(ulp_mode(toep) == ULP_MODE_ISCSI,
 1135             ("%s: ulp_mode %u for toep %p", __func__, ulp_mode(toep), toep));
 1136 
 1137         if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN))
 1138                 return;
 1139 
 1140         /*
 1141          * This function doesn't resume by itself.  Someone else must clear the
 1142          * flag and call this function.
 1143          */
 1144         if (__predict_false(toep->flags & TPF_TX_SUSPENDED)) {
 1145                 KASSERT(drop == 0,
 1146                     ("%s: drop (%d) != 0 but tx is suspended", __func__, drop));
 1147                 return;
 1148         }
 1149 
 1150         if (drop) {
 1151                 struct socket *so = inp->inp_socket;
 1152                 struct sockbuf *sb = &so->so_snd;
 1153                 int sbu;
 1154 
 1155                 /*
 1156                  * An unlocked read is ok here as the data should only
 1157                  * transition from a non-zero value to either another
 1158                  * non-zero value or zero.  Once it is zero it should
 1159                  * stay zero.
 1160                  */
 1161                 if (__predict_false(sbused(sb)) > 0) {
 1162                         SOCKBUF_LOCK(sb);
 1163                         sbu = sbused(sb);
 1164                         if (sbu > 0) {
 1165                                 /*
 1166                                  * The data transmitted before the
 1167                                  * tid's ULP mode changed to ISCSI is
 1168                                  * still in so_snd.  Incoming credits
 1169                                  * should account for so_snd first.
 1170                                  */
 1171                                 sbdrop_locked(sb, min(sbu, drop));
 1172                                 drop -= min(sbu, drop);
 1173                         }
 1174                         sowwakeup_locked(so);   /* unlocks so_snd */
 1175                 }
 1176                 rqdrop_locked(&toep->ulp_pdu_reclaimq, drop);
 1177         }
 1178 
 1179         while ((sndptr = mbufq_first(pduq)) != NULL) {
 1180                 wr = write_iscsi_mbuf_wr(toep, sndptr);
 1181                 if (wr == NULL) {
 1182                         toep->flags |= TPF_TX_SUSPENDED;
 1183                         return;
 1184                 }
 1185 
 1186                 plen = sndptr->m_pkthdr.len;
 1187                 credits = howmany(wr->wr_len, 16);
 1188                 KASSERT(toep->tx_credits >= credits,
 1189                         ("%s: not enough credits", __func__));
 1190 
 1191                 m = mbufq_dequeue(pduq);
 1192                 MPASS(m == sndptr);
 1193                 mbufq_enqueue(&toep->ulp_pdu_reclaimq, m);
 1194 
 1195                 toep->tx_credits -= credits;
 1196                 toep->tx_nocompl += credits;
 1197                 toep->plen_nocompl += plen;
 1198 
 1199                 /*
 1200                  * Ensure there are enough credits for a full-sized WR
 1201                  * as page pod WRs can be full-sized.
 1202                  */
 1203                 if (toep->tx_credits <= SGE_MAX_WR_LEN * 5 / 4 &&
 1204                     toep->tx_nocompl >= toep->tx_total / 4) {
 1205                         wrhdr = wrtod(wr);
 1206                         wrhdr->hi |= htobe32(F_FW_WR_COMPL);
 1207                         toep->tx_nocompl = 0;
 1208                         toep->plen_nocompl = 0;
 1209                 }
 1210 
 1211                 toep->flags |= TPF_TX_DATA_SENT;
 1212                 if (toep->tx_credits < MIN_OFLD_TX_CREDITS)
 1213                         toep->flags |= TPF_TX_SUSPENDED;
 1214 
 1215                 KASSERT(toep->txsd_avail > 0, ("%s: no txsd", __func__));
 1216                 txsd->plen = plen;
 1217                 txsd->tx_credits = credits;
 1218                 txsd++;
 1219                 if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) {
 1220                         toep->txsd_pidx = 0;
 1221                         txsd = &toep->txsd[0];
 1222                 }
 1223                 toep->txsd_avail--;
 1224 
 1225                 t4_l2t_send(sc, wr, toep->l2te);
 1226         }
 1227 
 1228         /* Send a FIN if requested, but only if there are no more PDUs to send */
 1229         if (mbufq_first(pduq) == NULL && toep->flags & TPF_SEND_FIN)
 1230                 t4_close_conn(sc, toep);
 1231 }
 1232 
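/*
 * Dispatch to the transmit routine that matches the connection's ULP mode:
 * iSCSI PDUs, kernel TLS records, or plain socket data.
 */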
 1233 static inline void
 1234 t4_push_data(struct adapter *sc, struct toepcb *toep, int drop)
 1235 {
 1236 
 1237         if (ulp_mode(toep) == ULP_MODE_ISCSI)
 1238                 t4_push_pdus(sc, toep, drop);
 1239         else if (toep->flags & TPF_KTLS)
 1240                 t4_push_ktls(sc, toep, drop);
 1241         else
 1242                 t4_push_frames(sc, toep, drop);
 1243 }
 1244 
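/*
 * tod_output method: push any data waiting in the socket's send buffer out to
 * the hardware.
 */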
 1245 int
 1246 t4_tod_output(struct toedev *tod, struct tcpcb *tp)
 1247 {
 1248         struct adapter *sc = tod->tod_softc;
 1249 #ifdef INVARIANTS
 1250         struct inpcb *inp = tptoinpcb(tp);
 1251 #endif
 1252         struct toepcb *toep = tp->t_toe;
 1253 
 1254         INP_WLOCK_ASSERT(inp);
 1255         KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 1256             ("%s: inp %p dropped.", __func__, inp));
 1257         KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 1258 
 1259         t4_push_data(sc, toep, 0);
 1260 
 1261         return (0);
 1262 }
 1263 
 1264 int
 1265 t4_send_fin(struct toedev *tod, struct tcpcb *tp)
 1266 {
 1267         struct adapter *sc = tod->tod_softc;
 1268 #ifdef INVARIANTS
 1269         struct inpcb *inp = tptoinpcb(tp);
 1270 #endif
 1271         struct toepcb *toep = tp->t_toe;
 1272 
 1273         INP_WLOCK_ASSERT(inp);
 1274         KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 1275             ("%s: inp %p dropped.", __func__, inp));
 1276         KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 1277 
 1278         toep->flags |= TPF_SEND_FIN;
 1279         if (tp->t_state >= TCPS_ESTABLISHED)
 1280                 t4_push_data(sc, toep, 0);
 1281 
 1282         return (0);
 1283 }
 1284 
 1285 int
 1286 t4_send_rst(struct toedev *tod, struct tcpcb *tp)
 1287 {
 1288         struct adapter *sc = tod->tod_softc;
 1289 #if defined(INVARIANTS)
 1290         struct inpcb *inp = tptoinpcb(tp);
 1291 #endif
 1292         struct toepcb *toep = tp->t_toe;
 1293 
 1294         INP_WLOCK_ASSERT(inp);
 1295         KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 1296             ("%s: inp %p dropped.", __func__, inp));
 1297         KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 1298 
 1299         /* hmmmm */
 1300         KASSERT(toep->flags & TPF_FLOWC_WR_SENT,
 1301             ("%s: flowc for tid %u [%s] not sent already",
 1302             __func__, toep->tid, tcpstates[tp->t_state]));
 1303 
 1304         send_reset(sc, toep, 0);
 1305         return (0);
 1306 }
 1307 
 1308 /*
 1309  * Peer has sent us a FIN.
 1310  */
 1311 static int
 1312 do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 1313 {
 1314         struct adapter *sc = iq->adapter;
 1315         const struct cpl_peer_close *cpl = (const void *)(rss + 1);
 1316         unsigned int tid = GET_TID(cpl);
 1317         struct toepcb *toep = lookup_tid(sc, tid);
 1318         struct inpcb *inp = toep->inp;
 1319         struct tcpcb *tp = NULL;
 1320         struct socket *so;
 1321         struct epoch_tracker et;
 1322 #ifdef INVARIANTS
 1323         unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 1324 #endif
 1325 
 1326         KASSERT(opcode == CPL_PEER_CLOSE,
 1327             ("%s: unexpected opcode 0x%x", __func__, opcode));
 1328         KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 1329 
 1330         if (__predict_false(toep->flags & TPF_SYNQE)) {
 1331                 /*
 1332                  * do_pass_establish must have run before do_peer_close and if
 1333                  * this is still a synqe instead of a toepcb then the connection
 1334                  * must be getting aborted.
 1335                  */
 1336                 MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 1337                 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 1338                     toep, toep->flags);
 1339                 return (0);
 1340         }
 1341 
 1342         KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 1343 
 1344         CURVNET_SET(toep->vnet);
 1345         NET_EPOCH_ENTER(et);
 1346         INP_WLOCK(inp);
 1347         tp = intotcpcb(inp);
 1348 
 1349         CTR6(KTR_CXGBE,
 1350             "%s: tid %u (%s), toep_flags 0x%x, ddp_flags 0x%x, inp %p",
 1351             __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 1352             toep->ddp.flags, inp);
 1353 
 1354         if (toep->flags & TPF_ABORT_SHUTDOWN)
 1355                 goto done;
 1356 
 1357         so = inp->inp_socket;
 1358         socantrcvmore(so);
 1359         if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 1360                 DDP_LOCK(toep);
 1361                 if (__predict_false(toep->ddp.flags &
 1362                     (DDP_BUF0_ACTIVE | DDP_BUF1_ACTIVE)))
 1363                         handle_ddp_close(toep, tp, cpl->rcv_nxt);
 1364                 DDP_UNLOCK(toep);
 1365         }
 1366 
 1367         if (ulp_mode(toep) == ULP_MODE_RDMA ||
 1368             (ulp_mode(toep) == ULP_MODE_ISCSI && chip_id(sc) >= CHELSIO_T6)) {
 1369                 /*
 1370                  * There might be data received via DDP before the FIN
 1371                  * that was not reported to the driver.  Assume that the
 1372                  * sequence number in the CPL is the correct sequence
 1373                  * number of the FIN.
 1374                  */
 1375         } else {
 1376                 KASSERT(tp->rcv_nxt + 1 == be32toh(cpl->rcv_nxt),
 1377                     ("%s: rcv_nxt mismatch: %u %u", __func__, tp->rcv_nxt,
 1378                     be32toh(cpl->rcv_nxt)));
 1379         }
 1380 
 1381         tp->rcv_nxt = be32toh(cpl->rcv_nxt);
 1382 
 1383         switch (tp->t_state) {
 1384         case TCPS_SYN_RECEIVED:
 1385                 tp->t_starttime = ticks;
 1386                 /* FALLTHROUGH */
 1387 
 1388         case TCPS_ESTABLISHED:
 1389                 tcp_state_change(tp, TCPS_CLOSE_WAIT);
 1390                 break;
 1391 
 1392         case TCPS_FIN_WAIT_1:
 1393                 tcp_state_change(tp, TCPS_CLOSING);
 1394                 break;
 1395 
 1396         case TCPS_FIN_WAIT_2:
 1397                 restore_so_proto(so, inp->inp_vflag & INP_IPV6);
 1398                 tcp_twstart(tp);
 1399                 INP_UNLOCK_ASSERT(inp);  /* safe, we have a ref on the inp */
 1400                 NET_EPOCH_EXIT(et);
 1401                 CURVNET_RESTORE();
 1402 
 1403                 INP_WLOCK(inp);
 1404                 final_cpl_received(toep);
 1405                 return (0);
 1406 
 1407         default:
 1408                 log(LOG_ERR, "%s: TID %u received CPL_PEER_CLOSE in state %d\n",
 1409                     __func__, tid, tp->t_state);
 1410         }
 1411 done:
 1412         INP_WUNLOCK(inp);
 1413         NET_EPOCH_EXIT(et);
 1414         CURVNET_RESTORE();
 1415         return (0);
 1416 }
 1417 
 1418 /*
 1419  * Peer has ACK'd our FIN.
 1420  */
 1421 static int
 1422 do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss,
 1423     struct mbuf *m)
 1424 {
 1425         struct adapter *sc = iq->adapter;
 1426         const struct cpl_close_con_rpl *cpl = (const void *)(rss + 1);
 1427         unsigned int tid = GET_TID(cpl);
 1428         struct toepcb *toep = lookup_tid(sc, tid);
 1429         struct inpcb *inp = toep->inp;
 1430         struct tcpcb *tp = NULL;
 1431         struct socket *so = NULL;
 1432         struct epoch_tracker et;
 1433 #ifdef INVARIANTS
 1434         unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 1435 #endif
 1436 
 1437         KASSERT(opcode == CPL_CLOSE_CON_RPL,
 1438             ("%s: unexpected opcode 0x%x", __func__, opcode));
 1439         KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 1440         KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 1441 
 1442         CURVNET_SET(toep->vnet);
 1443         NET_EPOCH_ENTER(et);
 1444         INP_WLOCK(inp);
 1445         tp = intotcpcb(inp);
 1446 
 1447         CTR4(KTR_CXGBE, "%s: tid %u (%s), toep_flags 0x%x",
 1448             __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags);
 1449 
 1450         if (toep->flags & TPF_ABORT_SHUTDOWN)
 1451                 goto done;
 1452 
 1453         so = inp->inp_socket;
 1454         tp->snd_una = be32toh(cpl->snd_nxt) - 1;        /* exclude FIN */
 1455 
 1456         switch (tp->t_state) {
 1457         case TCPS_CLOSING:      /* see TCPS_FIN_WAIT_2 in do_peer_close too */
 1458                 restore_so_proto(so, inp->inp_vflag & INP_IPV6);
 1459                 tcp_twstart(tp);
 1460 release:
 1461                 INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */
 1462                 NET_EPOCH_EXIT(et);
 1463                 CURVNET_RESTORE();
 1464 
 1465                 INP_WLOCK(inp);
 1466                 final_cpl_received(toep);       /* no more CPLs expected */
 1467 
 1468                 return (0);
 1469         case TCPS_LAST_ACK:
 1470                 if (tcp_close(tp))
 1471                         INP_WUNLOCK(inp);
 1472                 goto release;
 1473 
 1474         case TCPS_FIN_WAIT_1:
 1475                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 1476                         soisdisconnected(so);
 1477                 tcp_state_change(tp, TCPS_FIN_WAIT_2);
 1478                 break;
 1479 
 1480         default:
 1481                 log(LOG_ERR,
 1482                     "%s: TID %u received CPL_CLOSE_CON_RPL in state %s\n",
 1483                     __func__, tid, tcpstates[tp->t_state]);
 1484         }
 1485 done:
 1486         INP_WUNLOCK(inp);
 1487         NET_EPOCH_EXIT(et);
 1488         CURVNET_RESTORE();
 1489         return (0);
 1490 }
 1491 
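      /*
       * Queue a CPL_ABORT_RPL acknowledging an abort request from the
       * hardware for the given tid.
       */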
 1492 void
 1493 send_abort_rpl(struct adapter *sc, struct sge_ofld_txq *ofld_txq, int tid,
 1494     int rst_status)
 1495 {
 1496         struct wrqe *wr;
 1497         struct cpl_abort_rpl *cpl;
 1498 
 1499         wr = alloc_wrqe(sizeof(*cpl), &ofld_txq->wrq);
 1500         if (wr == NULL) {
 1501                 /* XXX */
 1502                 panic("%s: allocation failure.", __func__);
 1503         }
 1504         cpl = wrtod(wr);
 1505 
 1506         INIT_TP_WR_MIT_CPL(cpl, CPL_ABORT_RPL, tid);
 1507         cpl->cmd = rst_status;
 1508 
 1509         t4_wrq_tx(sc, wr);
 1510 }
 1511 
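      /*
       * Map the abort reason reported by the hardware to the errno delivered
       * to the socket's owner.
       */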
 1512 static int
 1513 abort_status_to_errno(struct tcpcb *tp, unsigned int abort_reason)
 1514 {
 1515         switch (abort_reason) {
 1516         case CPL_ERR_BAD_SYN:
 1517         case CPL_ERR_CONN_RESET:
 1518                 return (tp->t_state == TCPS_CLOSE_WAIT ? EPIPE : ECONNRESET);
 1519         case CPL_ERR_XMIT_TIMEDOUT:
 1520         case CPL_ERR_PERSIST_TIMEDOUT:
 1521         case CPL_ERR_FINWAIT2_TIMEDOUT:
 1522         case CPL_ERR_KEEPALIVE_TIMEDOUT:
 1523                 return (ETIMEDOUT);
 1524         default:
 1525                 return (EIO);
 1526         }
 1527 }
 1528 
 1529 /*
 1530  * TCP RST from the peer, timeout, or some other such critical error.
 1531  */
 1532 static int
 1533 do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 1534 {
 1535         struct adapter *sc = iq->adapter;
 1536         const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
 1537         unsigned int tid = GET_TID(cpl);
 1538         struct toepcb *toep = lookup_tid(sc, tid);
 1539         struct sge_ofld_txq *ofld_txq = toep->ofld_txq;
 1540         struct inpcb *inp;
 1541         struct tcpcb *tp;
 1542         struct epoch_tracker et;
 1543 #ifdef INVARIANTS
 1544         unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 1545 #endif
 1546 
 1547         KASSERT(opcode == CPL_ABORT_REQ_RSS,
 1548             ("%s: unexpected opcode 0x%x", __func__, opcode));
 1549         KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 1550 
 1551         if (toep->flags & TPF_SYNQE)
 1552                 return (do_abort_req_synqe(iq, rss, m));
 1553 
 1554         KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 1555 
 1556         if (negative_advice(cpl->status)) {
 1557                 CTR4(KTR_CXGBE, "%s: negative advice %d for tid %d (0x%x)",
 1558                     __func__, cpl->status, tid, toep->flags);
 1559                 return (0);     /* Ignore negative advice */
 1560         }
 1561 
 1562         inp = toep->inp;
 1563         CURVNET_SET(toep->vnet);
 1564         NET_EPOCH_ENTER(et);    /* for tcp_close */
 1565         INP_WLOCK(inp);
 1566 
 1567         tp = intotcpcb(inp);
 1568 
 1569         CTR6(KTR_CXGBE,
 1570             "%s: tid %d (%s), toep_flags 0x%x, inp_flags 0x%x, status %d",
 1571             __func__, tid, tp ? tcpstates[tp->t_state] : "no tp", toep->flags,
 1572             inp->inp_flags, cpl->status);
 1573 
 1574         /*
 1575          * If we'd initiated an abort earlier, the reply to it is responsible
 1576          * for cleaning up resources.  Otherwise we tear everything down right
 1577          * here.  We owe the T4 a CPL_ABORT_RPL no matter what.
 1578          */
 1579         if (toep->flags & TPF_ABORT_SHUTDOWN) {
 1580                 INP_WUNLOCK(inp);
 1581                 goto done;
 1582         }
 1583         toep->flags |= TPF_ABORT_SHUTDOWN;
 1584 
 1585         if ((inp->inp_flags & INP_DROPPED) == 0) {
 1586                 struct socket *so = inp->inp_socket;
 1587 
 1588                 if (so != NULL)
 1589                         so_error_set(so, abort_status_to_errno(tp,
 1590                             cpl->status));
 1591                 tp = tcp_close(tp);
 1592                 if (tp == NULL)
 1593                         INP_WLOCK(inp); /* re-acquire */
 1594         }
 1595 
 1596         final_cpl_received(toep);
 1597 done:
 1598         NET_EPOCH_EXIT(et);
 1599         CURVNET_RESTORE();
 1600         send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
 1601         return (0);
 1602 }
 1603 
 1604 /*
 1605  * Reply to the CPL_ABORT_REQ (send_reset)
 1606  */
 1607 static int
 1608 do_abort_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 1609 {
 1610         struct adapter *sc = iq->adapter;
 1611         const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
 1612         unsigned int tid = GET_TID(cpl);
 1613         struct toepcb *toep = lookup_tid(sc, tid);
 1614         struct inpcb *inp = toep->inp;
 1615 #ifdef INVARIANTS
 1616         unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 1617 #endif
 1618 
 1619         KASSERT(opcode == CPL_ABORT_RPL_RSS,
 1620             ("%s: unexpected opcode 0x%x", __func__, opcode));
 1621         KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 1622 
 1623         if (toep->flags & TPF_SYNQE)
 1624                 return (do_abort_rpl_synqe(iq, rss, m));
 1625 
 1626         KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 1627 
 1628         CTR5(KTR_CXGBE, "%s: tid %u, toep %p, inp %p, status %d",
 1629             __func__, tid, toep, inp, cpl->status);
 1630 
 1631         KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 1632             ("%s: wasn't expecting abort reply", __func__));
 1633 
 1634         INP_WLOCK(inp);
 1635         final_cpl_received(toep);
 1636 
 1637         return (0);
 1638 }
 1639 
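      /*
       * Rx payload (CPL_RX_DATA) for an offloaded connection.  Append the
       * data to the socket's receive buffer, update rcv_nxt and the receive
       * window, and return rx credits to the hardware as the window reopens.
       */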
 1640 static int
 1641 do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 1642 {
 1643         struct adapter *sc = iq->adapter;
 1644         const struct cpl_rx_data *cpl = mtod(m, const void *);
 1645         unsigned int tid = GET_TID(cpl);
 1646         struct toepcb *toep = lookup_tid(sc, tid);
 1647         struct inpcb *inp = toep->inp;
 1648         struct tcpcb *tp;
 1649         struct socket *so;
 1650         struct sockbuf *sb;
 1651         struct epoch_tracker et;
 1652         int len, rx_credits;
 1653         uint32_t ddp_placed = 0;
 1654 
 1655         if (__predict_false(toep->flags & TPF_SYNQE)) {
 1656                 /*
 1657                  * do_pass_establish must have run before do_rx_data and if this
 1658                  * is still a synqe instead of a toepcb then the connection must
 1659                  * be getting aborted.
 1660                  */
 1661                 MPASS(toep->flags & TPF_ABORT_SHUTDOWN);
 1662                 CTR4(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x)", __func__, tid,
 1663                     toep, toep->flags);
 1664                 m_freem(m);
 1665                 return (0);
 1666         }
 1667 
 1668         KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 1669 
 1670         /* strip off CPL header */
 1671         m_adj(m, sizeof(*cpl));
 1672         len = m->m_pkthdr.len;
 1673 
 1674         INP_WLOCK(inp);
 1675         if (inp->inp_flags & INP_DROPPED) {
 1676                 CTR4(KTR_CXGBE, "%s: tid %u, rx (%d bytes), inp_flags 0x%x",
 1677                     __func__, tid, len, inp->inp_flags);
 1678                 INP_WUNLOCK(inp);
 1679                 m_freem(m);
 1680                 return (0);
 1681         }
 1682 
 1683         tp = intotcpcb(inp);
 1684 
 1685         if (__predict_false(ulp_mode(toep) == ULP_MODE_TLS &&
 1686            toep->flags & TPF_TLS_RECEIVE)) {
 1687                 /* Received "raw" data on a TLS socket. */
 1688                 CTR3(KTR_CXGBE, "%s: tid %u, raw TLS data (%d bytes)",
 1689                     __func__, tid, len);
 1690                 do_rx_data_tls(cpl, toep, m);
 1691                 return (0);
 1692         }
 1693 
 1694         if (__predict_false(tp->rcv_nxt != be32toh(cpl->seq)))
 1695                 ddp_placed = be32toh(cpl->seq) - tp->rcv_nxt;
 1696 
 1697         tp->rcv_nxt += len;
 1698         if (tp->rcv_wnd < len) {
 1699                 KASSERT(ulp_mode(toep) == ULP_MODE_RDMA,
 1700                     ("%s: negative window size", __func__));
 1701         }
 1702 
 1703         tp->rcv_wnd -= len;
 1704         tp->t_rcvtime = ticks;
 1705 
 1706         if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 1707                 DDP_LOCK(toep);
 1708         so = inp_inpcbtosocket(inp);
 1709         sb = &so->so_rcv;
 1710         SOCKBUF_LOCK(sb);
 1711 
 1712         if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
 1713                 CTR3(KTR_CXGBE, "%s: tid %u, excess rx (%d bytes)",
 1714                     __func__, tid, len);
 1715                 m_freem(m);
 1716                 SOCKBUF_UNLOCK(sb);
 1717                 if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 1718                         DDP_UNLOCK(toep);
 1719                 INP_WUNLOCK(inp);
 1720 
 1721                 CURVNET_SET(toep->vnet);
 1722                 NET_EPOCH_ENTER(et);
 1723                 INP_WLOCK(inp);
 1724                 tp = tcp_drop(tp, ECONNRESET);
 1725                 if (tp)
 1726                         INP_WUNLOCK(inp);
 1727                 NET_EPOCH_EXIT(et);
 1728                 CURVNET_RESTORE();
 1729 
 1730                 return (0);
 1731         }
 1732 
 1733         /* receive buffer autosize */
 1734         MPASS(toep->vnet == so->so_vnet);
 1735         CURVNET_SET(toep->vnet);
 1736         if (sb->sb_flags & SB_AUTOSIZE &&
 1737             V_tcp_do_autorcvbuf &&
 1738             sb->sb_hiwat < V_tcp_autorcvbuf_max &&
 1739             len > (sbspace(sb) / 8 * 7)) {
 1740                 unsigned int hiwat = sb->sb_hiwat;
 1741                 unsigned int newsize = min(hiwat + sc->tt.autorcvbuf_inc,
 1742                     V_tcp_autorcvbuf_max);
 1743 
 1744                 if (!sbreserve_locked(so, SO_RCV, newsize, NULL))
 1745                         sb->sb_flags &= ~SB_AUTOSIZE;
 1746         }
 1747 
 1748         if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 1749                 int changed = !(toep->ddp.flags & DDP_ON) ^ cpl->ddp_off;
 1750 
 1751                 if (toep->ddp.waiting_count != 0 || toep->ddp.active_count != 0)
 1752                         CTR3(KTR_CXGBE, "%s: tid %u, non-ddp rx (%d bytes)",
 1753                             __func__, tid, len);
 1754 
 1755                 if (changed) {
 1756                         if (toep->ddp.flags & DDP_SC_REQ)
 1757                                 toep->ddp.flags ^= DDP_ON | DDP_SC_REQ;
 1758                         else {
 1759                                 KASSERT(cpl->ddp_off == 1,
 1760                                     ("%s: DDP switched on by itself.",
 1761                                     __func__));
 1762 
 1763                                 /* Fell out of DDP mode */
 1764                                 toep->ddp.flags &= ~DDP_ON;
 1765                                 CTR1(KTR_CXGBE, "%s: fell out of DDP mode",
 1766                                     __func__);
 1767 
 1768                                 insert_ddp_data(toep, ddp_placed);
 1769                         }
 1770                 }
 1771 
 1772                 if (toep->ddp.flags & DDP_ON) {
 1773                         /*
 1774                          * CPL_RX_DATA with DDP on can only be an indicate.
 1775                          * Start posting queued AIO requests via DDP.  The
 1776                          * payload that arrived in this indicate is appended
 1777                          * to the socket buffer as usual.
 1778                          */
 1779                         handle_ddp_indicate(toep);
 1780                 }
 1781         }
 1782 
 1783         sbappendstream_locked(sb, m, 0);
 1784         rx_credits = sbspace(sb) > tp->rcv_wnd ? sbspace(sb) - tp->rcv_wnd : 0;
 1785         if (rx_credits > 0 && sbused(sb) + tp->rcv_wnd < sb->sb_lowat) {
 1786                 rx_credits = send_rx_credits(sc, toep, rx_credits);
 1787                 tp->rcv_wnd += rx_credits;
 1788                 tp->rcv_adv += rx_credits;
 1789         }
 1790 
 1791         if (ulp_mode(toep) == ULP_MODE_TCPDDP && toep->ddp.waiting_count > 0 &&
 1792             sbavail(sb) != 0) {
 1793                 CTR2(KTR_CXGBE, "%s: tid %u queueing AIO task", __func__,
 1794                     tid);
 1795                 ddp_queue_toep(toep);
 1796         }
 1797         if (toep->flags & TPF_TLS_STARTING)
 1798                 tls_received_starting_data(sc, toep, sb, len);
 1799         sorwakeup_locked(so);
 1800         SOCKBUF_UNLOCK_ASSERT(sb);
 1801         if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 1802                 DDP_UNLOCK(toep);
 1803 
 1804         INP_WUNLOCK(inp);
 1805         CURVNET_RESTORE();
 1806         return (0);
 1807 }
 1808 
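      /*
       * Tx credit return (CPL_FW4_ACK) from the hardware.  Reclaim completed
       * tx descriptors, advance snd_una, drop acknowledged data from the send
       * buffer, and resume transmission if it had been suspended for lack of
       * credits.
       */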
 1809 static int
 1810 do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 1811 {
 1812         struct adapter *sc = iq->adapter;
 1813         const struct cpl_fw4_ack *cpl = (const void *)(rss + 1);
 1814         unsigned int tid = G_CPL_FW4_ACK_FLOWID(be32toh(OPCODE_TID(cpl)));
 1815         struct toepcb *toep = lookup_tid(sc, tid);
 1816         struct inpcb *inp;
 1817         struct tcpcb *tp;
 1818         struct socket *so;
 1819         uint8_t credits = cpl->credits;
 1820         struct ofld_tx_sdesc *txsd;
 1821         int plen;
 1822 #ifdef INVARIANTS
 1823         unsigned int opcode = G_CPL_FW4_ACK_OPCODE(be32toh(OPCODE_TID(cpl)));
 1824 #endif
 1825 
 1826         /*
 1827          * Very unusual case: we'd sent a flowc + abort_req for a synq entry and
 1828          * now this comes back carrying the credits for the flowc.
 1829          */
 1830         if (__predict_false(toep->flags & TPF_SYNQE)) {
 1831                 KASSERT(toep->flags & TPF_ABORT_SHUTDOWN,
 1832                     ("%s: credits for a synq entry %p", __func__, toep));
 1833                 return (0);
 1834         }
 1835 
 1836         inp = toep->inp;
 1837 
 1838         KASSERT(opcode == CPL_FW4_ACK,
 1839             ("%s: unexpected opcode 0x%x", __func__, opcode));
 1840         KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 1841         KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__));
 1842 
 1843         INP_WLOCK(inp);
 1844 
 1845         if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) {
 1846                 INP_WUNLOCK(inp);
 1847                 return (0);
 1848         }
 1849 
 1850         KASSERT((inp->inp_flags & INP_DROPPED) == 0,
 1851             ("%s: inp_flags 0x%x", __func__, inp->inp_flags));
 1852 
 1853         tp = intotcpcb(inp);
 1854 
 1855         if (cpl->flags & CPL_FW4_ACK_FLAGS_SEQVAL) {
 1856                 tcp_seq snd_una = be32toh(cpl->snd_una);
 1857 
 1858 #ifdef INVARIANTS
 1859                 if (__predict_false(SEQ_LT(snd_una, tp->snd_una))) {
 1860                         log(LOG_ERR,
 1861                             "%s: unexpected seq# %x for TID %u, snd_una %x\n",
 1862                             __func__, snd_una, toep->tid, tp->snd_una);
 1863                 }
 1864 #endif
 1865 
 1866                 if (tp->snd_una != snd_una) {
 1867                         tp->snd_una = snd_una;
 1868                         tp->ts_recent_age = tcp_ts_getticks();
 1869                 }
 1870         }
 1871 
 1872 #ifdef VERBOSE_TRACES
 1873         CTR3(KTR_CXGBE, "%s: tid %d credits %u", __func__, tid, credits);
 1874 #endif
 1875         so = inp->inp_socket;
 1876         txsd = &toep->txsd[toep->txsd_cidx];
 1877         plen = 0;
 1878         while (credits) {
 1879                 KASSERT(credits >= txsd->tx_credits,
 1880                     ("%s: too many (or partial) credits", __func__));
 1881                 credits -= txsd->tx_credits;
 1882                 toep->tx_credits += txsd->tx_credits;
 1883                 plen += txsd->plen;
 1884                 txsd++;
 1885                 toep->txsd_avail++;
 1886                 KASSERT(toep->txsd_avail <= toep->txsd_total,
 1887                     ("%s: txsd avail > total", __func__));
 1888                 if (__predict_false(++toep->txsd_cidx == toep->txsd_total)) {
 1889                         txsd = &toep->txsd[0];
 1890                         toep->txsd_cidx = 0;
 1891                 }
 1892         }
 1893 
 1894         if (toep->tx_credits == toep->tx_total) {
 1895                 toep->tx_nocompl = 0;
 1896                 toep->plen_nocompl = 0;
 1897         }
 1898 
 1899         if (toep->flags & TPF_TX_SUSPENDED &&
 1900             toep->tx_credits >= toep->tx_total / 4) {
 1901 #ifdef VERBOSE_TRACES
 1902                 CTR2(KTR_CXGBE, "%s: tid %d calling t4_push_frames", __func__,
 1903                     tid);
 1904 #endif
 1905                 toep->flags &= ~TPF_TX_SUSPENDED;
 1906                 CURVNET_SET(toep->vnet);
 1907                 t4_push_data(sc, toep, plen);
 1908                 CURVNET_RESTORE();
 1909         } else if (plen > 0) {
 1910                 struct sockbuf *sb = &so->so_snd;
 1911                 int sbu;
 1912 
 1913                 SOCKBUF_LOCK(sb);
 1914                 sbu = sbused(sb);
 1915                 if (ulp_mode(toep) == ULP_MODE_ISCSI) {
 1916                         if (__predict_false(sbu > 0)) {
 1917                                 /*
 1918                                  * The data transmitted before the
 1919                                  * tid's ULP mode changed to ISCSI is
 1920                                  * still in so_snd.  Incoming credits
 1921                                  * should account for so_snd first.
 1922                                  */
 1923                                 sbdrop_locked(sb, min(sbu, plen));
 1924                                 plen -= min(sbu, plen);
 1925                         }
 1926                         sowwakeup_locked(so);   /* unlocks so_snd */
 1927                         rqdrop_locked(&toep->ulp_pdu_reclaimq, plen);
 1928                 } else {
 1929 #ifdef VERBOSE_TRACES
 1930                         CTR3(KTR_CXGBE, "%s: tid %d dropped %d bytes", __func__,
 1931                             tid, plen);
 1932 #endif
 1933                         sbdrop_locked(sb, plen);
 1934                         if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 1935                                 t4_aiotx_queue_toep(so, toep);
 1936                         sowwakeup_locked(so);   /* unlocks so_snd */
 1937                 }
 1938                 SOCKBUF_UNLOCK_ASSERT(sb);
 1939         }
 1940 
 1941         INP_WUNLOCK(inp);
 1942 
 1943         return (0);
 1944 }
 1945 
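      /*
       * Update a field in the hardware TCB of a tid with a CPL_SET_TCB_FIELD
       * work request.  When the WR goes out on an offload queue it consumes
       * tx credits from the toepcb like any other tx work request.
       */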
 1946 void
 1947 t4_set_tcb_field(struct adapter *sc, struct sge_wrq *wrq, struct toepcb *toep,
 1948     uint16_t word, uint64_t mask, uint64_t val, int reply, int cookie)
 1949 {
 1950         struct wrqe *wr;
 1951         struct cpl_set_tcb_field *req;
 1952         struct ofld_tx_sdesc *txsd;
 1953 
 1954         MPASS((cookie & ~M_COOKIE) == 0);
 1955         if (reply) {
 1956                 MPASS(cookie != CPL_COOKIE_RESERVED);
 1957         }
 1958 
 1959         wr = alloc_wrqe(sizeof(*req), wrq);
 1960         if (wr == NULL) {
 1961                 /* XXX */
 1962                 panic("%s: allocation failure.", __func__);
 1963         }
 1964         req = wrtod(wr);
 1965 
 1966         INIT_TP_WR_MIT_CPL(req, CPL_SET_TCB_FIELD, toep->tid);
 1967         req->reply_ctrl = htobe16(V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 1968         if (reply == 0)
 1969                 req->reply_ctrl |= htobe16(F_NO_REPLY);
 1970         req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(cookie));
 1971         req->mask = htobe64(mask);
 1972         req->val = htobe64(val);
 1973         if (wrq->eq.type == EQ_OFLD) {
 1974                 txsd = &toep->txsd[toep->txsd_pidx];
 1975                 txsd->tx_credits = howmany(sizeof(*req), 16);
 1976                 txsd->plen = 0;
 1977                 KASSERT(toep->tx_credits >= txsd->tx_credits &&
 1978                     toep->txsd_avail > 0,
 1979                     ("%s: not enough credits (%d)", __func__,
 1980                     toep->tx_credits));
 1981                 toep->tx_credits -= txsd->tx_credits;
 1982                 if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 1983                         toep->txsd_pidx = 0;
 1984                 toep->txsd_avail--;
 1985         }
 1986 
 1987         t4_wrq_tx(sc, wr);
 1988 }
 1989 
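      /*
       * Register the CPL handlers used for established offloaded connections.
       * CPL_ABORT_RPL_RSS and CPL_FW4_ACK are shared with other users of the
       * adapter, so those are registered with the TOM cookie.
       */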
 1990 void
 1991 t4_init_cpl_io_handlers(void)
 1992 {
 1993 
 1994         t4_register_cpl_handler(CPL_PEER_CLOSE, do_peer_close);
 1995         t4_register_cpl_handler(CPL_CLOSE_CON_RPL, do_close_con_rpl);
 1996         t4_register_cpl_handler(CPL_ABORT_REQ_RSS, do_abort_req);
 1997         t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, do_abort_rpl,
 1998             CPL_COOKIE_TOM);
 1999         t4_register_cpl_handler(CPL_RX_DATA, do_rx_data);
 2000         t4_register_shared_cpl_handler(CPL_FW4_ACK, do_fw4_ack, CPL_COOKIE_TOM);
 2001 }
 2002 
 2003 void
 2004 t4_uninit_cpl_io_handlers(void)
 2005 {
 2006 
 2007         t4_register_cpl_handler(CPL_PEER_CLOSE, NULL);
 2008         t4_register_cpl_handler(CPL_CLOSE_CON_RPL, NULL);
 2009         t4_register_cpl_handler(CPL_ABORT_REQ_RSS, NULL);
 2010         t4_register_shared_cpl_handler(CPL_ABORT_RPL_RSS, NULL, CPL_COOKIE_TOM);
 2011         t4_register_cpl_handler(CPL_RX_DATA, NULL);
 2012         t4_register_shared_cpl_handler(CPL_FW4_ACK, NULL, CPL_COOKIE_TOM);
 2013 }
 2014 
 2015 /*
 2016  * Use the 'backend1' field in AIO jobs to hold an error that should
 2017  * be reported when the job is completed, the 'backend3' field to
 2018  * store the amount of data sent by the AIO job so far, and the
 2019  * 'backend4' field to hold a reference count on the job.
 2020  *
 2021  * Each unmapped mbuf holds a reference on the job as does the queue
 2022  * so long as the job is queued.
 2023  */
 2024 #define aio_error       backend1
 2025 #define aio_sent        backend3
 2026 #define aio_refs        backend4
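
      /*
       * Reference lifecycle, as implemented below: t4_aio_queue_aiotx()
       * initializes aio_refs to 1 for the queue's reference,
       * alloc_aiotx_mbuf() acquires one reference per unmapped mbuf, and
       * aiotx_free_pgs()/aiotx_free_job() release them.  The job is
       * completed (or cancelled) only when the last reference is dropped
       * in aiotx_free_job().
       */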
 2027 
 2028 #ifdef VERBOSE_TRACES
 2029 static int
 2030 jobtotid(struct kaiocb *job)
 2031 {
 2032         struct socket *so;
 2033         struct tcpcb *tp;
 2034         struct toepcb *toep;
 2035 
 2036         so = job->fd_file->f_data;
 2037         tp = sototcpcb(so);
 2038         toep = tp->t_toe;
 2039         return (toep->tid);
 2040 }
 2041 #endif
 2042 
 2043 static void
 2044 aiotx_free_job(struct kaiocb *job)
 2045 {
 2046         long status;
 2047         int error;
 2048 
 2049         if (refcount_release(&job->aio_refs) == 0)
 2050                 return;
 2051 
 2052         error = (intptr_t)job->aio_error;
 2053         status = job->aio_sent;
 2054 #ifdef VERBOSE_TRACES
 2055         CTR5(KTR_CXGBE, "%s: tid %d completed %p len %ld, error %d", __func__,
 2056             jobtotid(job), job, status, error);
 2057 #endif
 2058         if (error != 0 && status != 0)
 2059                 error = 0;
 2060         if (error == ECANCELED)
 2061                 aio_cancel(job);
 2062         else if (error)
 2063                 aio_complete(job, -1, error);
 2064         else {
 2065                 job->msgsnd = 1;
 2066                 aio_complete(job, status, 0);
 2067         }
 2068 }
 2069 
 2070 static void
 2071 aiotx_free_pgs(struct mbuf *m)
 2072 {
 2073         struct kaiocb *job;
 2074         vm_page_t pg;
 2075 
 2076         M_ASSERTEXTPG(m);
 2077         job = m->m_ext.ext_arg1;
 2078 #ifdef VERBOSE_TRACES
 2079         CTR3(KTR_CXGBE, "%s: completed %d bytes for tid %d", __func__,
 2080             m->m_len, jobtotid(job));
 2081 #endif
 2082 
 2083         for (int i = 0; i < m->m_epg_npgs; i++) {
 2084                 pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 2085                 vm_page_unwire(pg, PQ_ACTIVE);
 2086         }
 2087 
 2088         aiotx_free_job(job);
 2089 }
 2090 
 2091 /*
 2092  * Allocate a chain of unmapped mbufs describing the next 'len' bytes
 2093  * of an AIO job.
 2094  */
 2095 static struct mbuf *
 2096 alloc_aiotx_mbuf(struct kaiocb *job, int len)
 2097 {
 2098         struct vmspace *vm;
 2099         vm_page_t pgs[MBUF_PEXT_MAX_PGS];
 2100         struct mbuf *m, *top, *last;
 2101         vm_map_t map;
 2102         vm_offset_t start;
 2103         int i, mlen, npages, pgoff;
 2104 
 2105         KASSERT(job->aio_sent + len <= job->uaiocb.aio_nbytes,
 2106             ("%s(%p, %d): request to send beyond end of buffer", __func__,
 2107             job, len));
 2108 
 2109         /*
 2110          * The AIO subsystem will cancel and drain all requests before
 2111          * permitting a process to exit or exec, so p_vmspace should
 2112          * be stable here.
 2113          */
 2114         vm = job->userproc->p_vmspace;
 2115         map = &vm->vm_map;
 2116         start = (uintptr_t)job->uaiocb.aio_buf + job->aio_sent;
 2117         pgoff = start & PAGE_MASK;
 2118 
 2119         top = NULL;
 2120         last = NULL;
 2121         while (len > 0) {
 2122                 mlen = imin(len, MBUF_PEXT_MAX_PGS * PAGE_SIZE - pgoff);
 2123                 KASSERT(mlen == len || ((start + mlen) & PAGE_MASK) == 0,
 2124                     ("%s: next start (%#jx + %#x) is not page aligned",
 2125                     __func__, (uintmax_t)start, mlen));
 2126 
 2127                 npages = vm_fault_quick_hold_pages(map, start, mlen,
 2128                     VM_PROT_WRITE, pgs, nitems(pgs));
 2129                 if (npages < 0)
 2130                         break;
 2131 
 2132                 m = mb_alloc_ext_pgs(M_WAITOK, aiotx_free_pgs);
 2133                 if (m == NULL) {
 2134                         vm_page_unhold_pages(pgs, npages);
 2135                         break;
 2136                 }
 2137 
 2138                 m->m_epg_1st_off = pgoff;
 2139                 m->m_epg_npgs = npages;
 2140                 if (npages == 1) {
 2141                         KASSERT(mlen + pgoff <= PAGE_SIZE,
 2142                             ("%s: single page is too large (off %d len %d)",
 2143                             __func__, pgoff, mlen));
 2144                         m->m_epg_last_len = mlen;
 2145                 } else {
 2146                         m->m_epg_last_len = mlen - (PAGE_SIZE - pgoff) -
 2147                             (npages - 2) * PAGE_SIZE;
 2148                 }
 2149                 for (i = 0; i < npages; i++)
 2150                         m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pgs[i]);
 2151 
 2152                 m->m_len = mlen;
 2153                 m->m_ext.ext_size = npages * PAGE_SIZE;
 2154                 m->m_ext.ext_arg1 = job;
 2155                 refcount_acquire(&job->aio_refs);
 2156 
 2157 #ifdef VERBOSE_TRACES
 2158                 CTR5(KTR_CXGBE, "%s: tid %d, new mbuf %p for job %p, npages %d",
 2159                     __func__, jobtotid(job), m, job, npages);
 2160 #endif
 2161 
 2162                 if (top == NULL)
 2163                         top = m;
 2164                 else
 2165                         last->m_next = m;
 2166                 last = m;
 2167 
 2168                 len -= mlen;
 2169                 start += mlen;
 2170                 pgoff = 0;
 2171         }
 2172 
 2173         return (top);
 2174 }
 2175 
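      /*
       * Try to transmit (part of) an AIO write job on an offloaded socket.
       * This is a zero-copy variant of the sosend/tcp_usr_send path: the
       * user's pages are wired and wrapped in unmapped mbufs by
       * alloc_aiotx_mbuf() instead of being copied into the socket buffer.
       * If the socket buffer is too full the job is requeued and retried
       * once the socket becomes writable again.
       */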
 2176 static void
 2177 t4_aiotx_process_job(struct toepcb *toep, struct socket *so, struct kaiocb *job)
 2178 {
 2179         struct sockbuf *sb;
 2180         struct inpcb *inp;
 2181         struct tcpcb *tp;
 2182         struct mbuf *m;
 2183         int error, len;
 2184         bool moretocome, sendmore;
 2185 
 2186         sb = &so->so_snd;
 2187         SOCKBUF_UNLOCK(sb);
 2188         m = NULL;
 2189 
 2190 #ifdef MAC
 2191         error = mac_socket_check_send(job->fd_file->f_cred, so);
 2192         if (error != 0)
 2193                 goto out;
 2194 #endif
 2195 
 2196         /* Inline sosend_generic(). */
 2197 
 2198         error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 2199         MPASS(error == 0);
 2200 
 2201 sendanother:
 2202         SOCKBUF_LOCK(sb);
 2203         if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 2204                 SOCKBUF_UNLOCK(sb);
 2205                 SOCK_IO_SEND_UNLOCK(so);
 2206                 if ((so->so_options & SO_NOSIGPIPE) == 0) {
 2207                         PROC_LOCK(job->userproc);
 2208                         kern_psignal(job->userproc, SIGPIPE);
 2209                         PROC_UNLOCK(job->userproc);
 2210                 }
 2211                 error = EPIPE;
 2212                 goto out;
 2213         }
 2214         if (so->so_error) {
 2215                 error = so->so_error;
 2216                 so->so_error = 0;
 2217                 SOCKBUF_UNLOCK(sb);
 2218                 SOCK_IO_SEND_UNLOCK(so);
 2219                 goto out;
 2220         }
 2221         if ((so->so_state & SS_ISCONNECTED) == 0) {
 2222                 SOCKBUF_UNLOCK(sb);
 2223                 SOCK_IO_SEND_UNLOCK(so);
 2224                 error = ENOTCONN;
 2225                 goto out;
 2226         }
 2227         if (sbspace(sb) < sb->sb_lowat) {
 2228                 MPASS(job->aio_sent == 0 || !(so->so_state & SS_NBIO));
 2229 
 2230                 /*
 2231                  * Don't block if there is too little room in the socket
 2232                  * buffer.  Instead, requeue the request.
 2233                  */
 2234                 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 2235                         SOCKBUF_UNLOCK(sb);
 2236                         SOCK_IO_SEND_UNLOCK(so);
 2237                         error = ECANCELED;
 2238                         goto out;
 2239                 }
 2240                 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 2241                 SOCKBUF_UNLOCK(sb);
 2242                 SOCK_IO_SEND_UNLOCK(so);
 2243                 goto out;
 2244         }
 2245 
 2246         /*
 2247          * Write as much data as the socket permits, but no more than
 2248          * a single sndbuf at a time.
 2249          */
 2250         len = sbspace(sb);
 2251         if (len > job->uaiocb.aio_nbytes - job->aio_sent) {
 2252                 len = job->uaiocb.aio_nbytes - job->aio_sent;
 2253                 moretocome = false;
 2254         } else
 2255                 moretocome = true;
 2256         if (len > toep->params.sndbuf) {
 2257                 len = toep->params.sndbuf;
 2258                 sendmore = true;
 2259         } else
 2260                 sendmore = false;
 2261 
 2262         if (!TAILQ_EMPTY(&toep->aiotx_jobq))
 2263                 moretocome = true;
 2264         SOCKBUF_UNLOCK(sb);
 2265         MPASS(len != 0);
 2266 
 2267         m = alloc_aiotx_mbuf(job, len);
 2268         if (m == NULL) {
 2269                 SOCK_IO_SEND_UNLOCK(so);
 2270                 error = EFAULT;
 2271                 goto out;
 2272         }
 2273 
 2274         /* Inlined tcp_usr_send(). */
 2275 
 2276         inp = toep->inp;
 2277         INP_WLOCK(inp);
 2278         if (inp->inp_flags & INP_DROPPED) {
 2279                 INP_WUNLOCK(inp);
 2280                 SOCK_IO_SEND_UNLOCK(so);
 2281                 error = ECONNRESET;
 2282                 goto out;
 2283         }
 2284 
 2285         job->aio_sent += m_length(m, NULL);
 2286 
 2287         sbappendstream(sb, m, 0);
 2288         m = NULL;
 2289 
 2290         if (!(inp->inp_flags & INP_DROPPED)) {
 2291                 tp = intotcpcb(inp);
 2292                 if (moretocome)
 2293                         tp->t_flags |= TF_MORETOCOME;
 2294                 error = tcp_output(tp);
 2295                 if (error < 0) {
 2296                         INP_UNLOCK_ASSERT(inp);
 2297                         SOCK_IO_SEND_UNLOCK(so);
 2298                         error = -error;
 2299                         goto out;
 2300                 }
 2301                 if (moretocome)
 2302                         tp->t_flags &= ~TF_MORETOCOME;
 2303         }
 2304 
 2305         INP_WUNLOCK(inp);
 2306         if (sendmore)
 2307                 goto sendanother;
 2308         SOCK_IO_SEND_UNLOCK(so);
 2309 
 2310         if (error)
 2311                 goto out;
 2312 
 2313         /*
 2314          * If this is a blocking socket and the request has not been
 2315          * fully completed, requeue it until the socket is ready
 2316          * again.
 2317          */
 2318         if (job->aio_sent < job->uaiocb.aio_nbytes &&
 2319             !(so->so_state & SS_NBIO)) {
 2320                 SOCKBUF_LOCK(sb);
 2321                 if (!aio_set_cancel_function(job, t4_aiotx_cancel)) {
 2322                         SOCKBUF_UNLOCK(sb);
 2323                         error = ECANCELED;
 2324                         goto out;
 2325                 }
 2326                 TAILQ_INSERT_HEAD(&toep->aiotx_jobq, job, list);
 2327                 return;
 2328         }
 2329 
 2330         /*
 2331          * If the request will not be requeued, drop the queue's
 2332          * reference to the job.  Any mbufs in flight should still
 2333          * hold a reference, but this drops the reference that the
 2334          * queue owns while it is waiting to queue mbufs to the
 2335          * socket.
 2336          */
 2337         aiotx_free_job(job);
 2338 
 2339 out:
 2340         if (error) {
 2341                 job->aio_error = (void *)(intptr_t)error;
 2342                 aiotx_free_job(job);
 2343         }
 2344         m_freem(m);
 2345         SOCKBUF_LOCK(sb);
 2346 }
 2347 
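      /*
       * Task handler (queued via soaio_enqueue()) that drains this
       * connection's AIO tx job queue while the socket stays writable.  The
       * socket and toepcb references taken in t4_aiotx_queue_toep() are
       * released here.
       */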
 2348 static void
 2349 t4_aiotx_task(void *context, int pending)
 2350 {
 2351         struct toepcb *toep = context;
 2352         struct socket *so;
 2353         struct kaiocb *job;
 2354         struct epoch_tracker et;
 2355 
 2356         so = toep->aiotx_so;
 2357         CURVNET_SET(toep->vnet);
 2358         NET_EPOCH_ENTER(et);
 2359         SOCKBUF_LOCK(&so->so_snd);
 2360         while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) {
 2361                 job = TAILQ_FIRST(&toep->aiotx_jobq);
 2362                 TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 2363                 if (!aio_clear_cancel_function(job))
 2364                         continue;
 2365 
 2366                 t4_aiotx_process_job(toep, so, job);
 2367         }
 2368         toep->aiotx_so = NULL;
 2369         SOCKBUF_UNLOCK(&so->so_snd);
 2370         NET_EPOCH_EXIT(et);
 2371 
 2372         free_toepcb(toep);
 2373         sorele(so);
 2374         CURVNET_RESTORE();
 2375 }
 2376 
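      /*
       * Schedule the aiotx task for this connection unless it is already
       * pending.  A reference is taken on both the socket and the toepcb;
       * t4_aiotx_task() drops them when it is done.
       */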
 2377 static void
 2378 t4_aiotx_queue_toep(struct socket *so, struct toepcb *toep)
 2379 {
 2380 
 2381         SOCKBUF_LOCK_ASSERT(&toep->inp->inp_socket->so_snd);
 2382 #ifdef VERBOSE_TRACES
 2383         CTR3(KTR_CXGBE, "%s: queueing aiotx task for tid %d, active = %s",
 2384             __func__, toep->tid, toep->aiotx_so != NULL ? "true" : "false");
 2385 #endif
 2386         if (toep->aiotx_so != NULL)
 2387                 return;
 2388         soref(so);
 2389         toep->aiotx_so = so;
 2390         hold_toepcb(toep);
 2391         soaio_enqueue(&toep->aiotx_task);
 2392 }
 2393 
 2394 static void
 2395 t4_aiotx_cancel(struct kaiocb *job)
 2396 {
 2397         struct socket *so;
 2398         struct sockbuf *sb;
 2399         struct tcpcb *tp;
 2400         struct toepcb *toep;
 2401 
 2402         so = job->fd_file->f_data;
 2403         tp = sototcpcb(so);
 2404         toep = tp->t_toe;
 2405         MPASS(job->uaiocb.aio_lio_opcode == LIO_WRITE);
 2406         sb = &so->so_snd;
 2407 
 2408         SOCKBUF_LOCK(sb);
 2409         if (!aio_cancel_cleared(job))
 2410                 TAILQ_REMOVE(&toep->aiotx_jobq, job, list);
 2411         SOCKBUF_UNLOCK(sb);
 2412 
 2413         job->aio_error = (void *)(intptr_t)ECANCELED;
 2414         aiotx_free_job(job);
 2415 }
 2416 
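      /*
       * aio_write(2) entry point for offloaded sockets.  Zero-copy AIO tx is
       * attempted only when the adapter's tt.tx_zcopy knob is enabled and the
       * connection has no TLS tx key; otherwise EOPNOTSUPP is returned and
       * the request is handled by the regular AIO path instead.
       */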
 2417 int
 2418 t4_aio_queue_aiotx(struct socket *so, struct kaiocb *job)
 2419 {
 2420         struct tcpcb *tp = sototcpcb(so);
 2421         struct toepcb *toep = tp->t_toe;
 2422         struct adapter *sc = td_adapter(toep->td);
 2423 
 2424         /* This only handles writes. */
 2425         if (job->uaiocb.aio_lio_opcode != LIO_WRITE)
 2426                 return (EOPNOTSUPP);
 2427 
 2428         if (!sc->tt.tx_zcopy)
 2429                 return (EOPNOTSUPP);
 2430 
 2431         if (tls_tx_key(toep))
 2432                 return (EOPNOTSUPP);
 2433 
 2434         SOCKBUF_LOCK(&so->so_snd);
 2435 #ifdef VERBOSE_TRACES
 2436         CTR3(KTR_CXGBE, "%s: queueing %p for tid %u", __func__, job, toep->tid);
 2437 #endif
 2438         if (!aio_set_cancel_function(job, t4_aiotx_cancel))
 2439                 panic("new job was cancelled");
 2440         refcount_init(&job->aio_refs, 1);
 2441         TAILQ_INSERT_TAIL(&toep->aiotx_jobq, job, list);
 2442         if (sowriteable(so))
 2443                 t4_aiotx_queue_toep(so, toep);
 2444         SOCKBUF_UNLOCK(&so->so_snd);
 2445         return (0);
 2446 }
 2447 
 2448 void
 2449 aiotx_init_toep(struct toepcb *toep)
 2450 {
 2451 
 2452         TAILQ_INIT(&toep->aiotx_jobq);
 2453         TASK_INIT(&toep->aiotx_task, 0, t4_aiotx_task, toep);
 2454 }
 2455 #endif
