FreeBSD/Linux Kernel Cross Reference
sys/dev/cxgbe/tom/t4_listen.c


/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"

#ifdef TCP_OFFLOAD
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/domain.h>
#include <sys/fnv_hash.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/route.h>
#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_fib.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet6/in6_fib.h>
#include <netinet6/scope6_var.h>
#include <netinet/tcp_timer.h>
#define TCPSTATES
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_var.h>
#include <netinet/toecore.h>
#include <netinet/cc/cc.h>

#include "common/common.h"
#include "common/t4_msg.h"
#include "common/t4_regs.h"
#include "t4_clip.h"
#include "tom/t4_tom_l2t.h"
#include "tom/t4_tom.h"

/* stid services */
static int alloc_stid(struct adapter *, struct listen_ctx *, int);
static struct listen_ctx *lookup_stid(struct adapter *, int);
static void free_stid(struct adapter *, struct listen_ctx *);

/* lctx services */
static struct listen_ctx *alloc_lctx(struct adapter *, struct inpcb *,
    struct vi_info *);
static int free_lctx(struct adapter *, struct listen_ctx *);
static void hold_lctx(struct listen_ctx *);
static void listen_hash_add(struct adapter *, struct listen_ctx *);
static struct listen_ctx *listen_hash_find(struct adapter *, struct inpcb *);
static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *);
static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *);

static void send_abort_rpl_synqe(struct toedev *, struct synq_entry *, int);

static int
alloc_stid(struct adapter *sc, struct listen_ctx *lctx, int isipv6)
{
        struct tid_info *t = &sc->tids;
        u_int stid, n, f, mask;
        struct stid_region *sr = &lctx->stid_region;

        /*
         * An IPv6 server needs 2 naturally aligned stids (1 stid = 4 cells) in
         * the TCAM.  The start of the stid region is properly aligned (the chip
         * requires each region to be 128-cell aligned).
         */
        n = isipv6 ? 2 : 1;
        mask = n - 1;
        KASSERT((t->stid_base & mask) == 0 && (t->nstids & mask) == 0,
            ("%s: stid region (%u, %u) not properly aligned.  n = %u",
            __func__, t->stid_base, t->nstids, n));

        mtx_lock(&t->stid_lock);
        if (n > t->nstids - t->stids_in_use) {
                mtx_unlock(&t->stid_lock);
                return (-1);
        }

        if (t->nstids_free_head >= n) {
                /*
                 * This allocation will definitely succeed because the region
                 * starts at a good alignment and we just checked we have enough
                 * stids free.
                 */
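                /*
                 * For example: with nstids_free_head = 7 and an IPv6 listener
                 * (n = 2, mask = 1), f = 1 and the free head shrinks to 4, so
                 * stids 4 and 5 are used and the allocation stays naturally
                 * aligned; the leftover stid 6 is recorded below as this
                 * region's free slack.
                 */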
                f = t->nstids_free_head & mask;
                t->nstids_free_head -= n + f;
                stid = t->nstids_free_head;
                TAILQ_INSERT_HEAD(&t->stids, sr, link);
        } else {
                struct stid_region *s;

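                /*
                 * Walk the regions in use; stid tracks the running end of
                 * each region's span (used stids plus the free gap after
                 * them).  The new region is carved out of the tail of the
                 * first free gap that can hold n stids plus the f slack
                 * cells needed to keep the allocation naturally aligned.
                 */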
                stid = t->nstids_free_head;
                TAILQ_FOREACH(s, &t->stids, link) {
                        stid += s->used + s->free;
                        f = stid & mask;
                        if (s->free >= n + f) {
                                stid -= n + f;
                                s->free -= n + f;
                                TAILQ_INSERT_AFTER(&t->stids, s, sr, link);
                                goto allocated;
                        }
                }

                if (__predict_false(stid != t->nstids)) {
                        panic("%s: stids TAILQ (%p) corrupt."
                            "  At %d instead of %d at the end of the queue.",
                            __func__, &t->stids, stid, t->nstids);
                }

                mtx_unlock(&t->stid_lock);
                return (-1);
        }

allocated:
        sr->used = n;
        sr->free = f;
        t->stids_in_use += n;
        t->stid_tab[stid] = lctx;
        mtx_unlock(&t->stid_lock);

        KASSERT(((stid + t->stid_base) & mask) == 0,
            ("%s: EDOOFUS.", __func__));
        return (stid + t->stid_base);
}

static struct listen_ctx *
lookup_stid(struct adapter *sc, int stid)
{
        struct tid_info *t = &sc->tids;

        return (t->stid_tab[stid - t->stid_base]);
}

static void
free_stid(struct adapter *sc, struct listen_ctx *lctx)
{
        struct tid_info *t = &sc->tids;
        struct stid_region *sr = &lctx->stid_region;
        struct stid_region *s;

        KASSERT(sr->used > 0, ("%s: nonsense free (%d)", __func__, sr->used));

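        /*
         * Return this region's cells (used plus trailing free slack) to the
         * predecessor's free gap, or to the head free pool if this was the
         * first region in the list.
         */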
        mtx_lock(&t->stid_lock);
        s = TAILQ_PREV(sr, stid_head, link);
        if (s != NULL)
                s->free += sr->used + sr->free;
        else
                t->nstids_free_head += sr->used + sr->free;
        KASSERT(t->stids_in_use >= sr->used,
            ("%s: stids_in_use (%u) < stids being freed (%u)", __func__,
            t->stids_in_use, sr->used));
        t->stids_in_use -= sr->used;
        TAILQ_REMOVE(&t->stids, sr, link);
        mtx_unlock(&t->stid_lock);
}

static struct listen_ctx *
alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi)
{
        struct listen_ctx *lctx;

        INP_WLOCK_ASSERT(inp);

        lctx = malloc(sizeof(struct listen_ctx), M_CXGBE, M_NOWAIT | M_ZERO);
        if (lctx == NULL)
                return (NULL);

        lctx->stid = alloc_stid(sc, lctx, inp->inp_vflag & INP_IPV6);
        if (lctx->stid < 0) {
                free(lctx, M_CXGBE);
                return (NULL);
        }

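        /*
         * A listener bound to a specific IPv6 address needs that address in
         * the chip's CLIP (compressed local IP) table.  The entry is released
         * in free_lctx.
         */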
        if (inp->inp_vflag & INP_IPV6 &&
            !IN6_ARE_ADDR_EQUAL(&in6addr_any, &inp->in6p_laddr)) {
                lctx->ce = t4_get_clip_entry(sc, &inp->in6p_laddr, true);
                if (lctx->ce == NULL) {
                        free(lctx, M_CXGBE);
                        return (NULL);
                }
        }

        lctx->ctrlq = &sc->sge.ctrlq[vi->pi->port_id];
        lctx->ofld_rxq = &sc->sge.ofld_rxq[vi->first_ofld_rxq];
        refcount_init(&lctx->refcount, 1);

        lctx->inp = inp;
        lctx->vnet = inp->inp_socket->so_vnet;
        in_pcbref(inp);

        return (lctx);
}

/* Don't call this directly, use release_lctx instead */
static int
free_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
        struct inpcb *inp = lctx->inp;

        INP_WLOCK_ASSERT(inp);
        KASSERT(lctx->refcount == 0,
            ("%s: refcount %d", __func__, lctx->refcount));
        KASSERT(lctx->stid >= 0, ("%s: bad stid %d.", __func__, lctx->stid));

        CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, inp %p",
            __func__, lctx->stid, lctx, lctx->inp);

        if (lctx->ce)
                t4_release_clip_entry(sc, lctx->ce);
        free_stid(sc, lctx);
        free(lctx, M_CXGBE);

        return (in_pcbrele_wlocked(inp));
}

static void
hold_lctx(struct listen_ctx *lctx)
{

        refcount_acquire(&lctx->refcount);
}

static inline uint32_t
listen_hashfn(void *key, u_long mask)
{

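        /* key is an inp pointer; hash the pointer value itself. */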
        return (fnv_32_buf(&key, sizeof(key), FNV1_32_INIT) & mask);
}

/*
 * Add a listen_ctx entry to the listen hash table.
 */
static void
listen_hash_add(struct adapter *sc, struct listen_ctx *lctx)
{
        struct tom_data *td = sc->tom_softc;
        int bucket = listen_hashfn(lctx->inp, td->listen_mask);

        mtx_lock(&td->lctx_hash_lock);
        LIST_INSERT_HEAD(&td->listen_hash[bucket], lctx, link);
        td->lctx_count++;
        mtx_unlock(&td->lctx_hash_lock);
}

/*
 * Look for the listening socket's context entry in the hash and return it.
 */
static struct listen_ctx *
listen_hash_find(struct adapter *sc, struct inpcb *inp)
{
        struct tom_data *td = sc->tom_softc;
        int bucket = listen_hashfn(inp, td->listen_mask);
        struct listen_ctx *lctx;

        mtx_lock(&td->lctx_hash_lock);
        LIST_FOREACH(lctx, &td->listen_hash[bucket], link) {
                if (lctx->inp == inp)
                        break;
        }
        mtx_unlock(&td->lctx_hash_lock);

        return (lctx);
}

/*
 * Removes the listen_ctx structure for inp from the hash and returns it.
 */
static struct listen_ctx *
listen_hash_del(struct adapter *sc, struct inpcb *inp)
{
        struct tom_data *td = sc->tom_softc;
        int bucket = listen_hashfn(inp, td->listen_mask);
        struct listen_ctx *lctx, *l;

        mtx_lock(&td->lctx_hash_lock);
        LIST_FOREACH_SAFE(lctx, &td->listen_hash[bucket], link, l) {
                if (lctx->inp == inp) {
                        LIST_REMOVE(lctx, link);
                        td->lctx_count--;
                        break;
                }
        }
        mtx_unlock(&td->lctx_hash_lock);

        return (lctx);
}

/*
 * Releases a hold on the lctx.  Must be called with the listening socket's inp
 * locked.  The inp may be freed by this function and it returns NULL to
 * indicate this.
 */
static struct inpcb *
release_lctx(struct adapter *sc, struct listen_ctx *lctx)
{
        struct inpcb *inp = lctx->inp;
        int inp_freed = 0;

        INP_WLOCK_ASSERT(inp);
        if (refcount_release(&lctx->refcount))
                inp_freed = free_lctx(sc, lctx);

        return (inp_freed ? NULL : inp);
}

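/*
 * Introduce this embryonic connection's flow to the firmware with a flowc
 * work request.  This goes out before any other work request on the tid,
 * which is why the abort path below sends one first if it hasn't been sent.
 */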
static void
send_flowc_wr_synqe(struct adapter *sc, struct synq_entry *synqe)
{
        struct mbuf *m = synqe->syn;
        struct ifnet *ifp = m->m_pkthdr.rcvif;
        struct vi_info *vi = ifp->if_softc;
        struct port_info *pi = vi->pi;
        struct wrqe *wr;
        struct fw_flowc_wr *flowc;
        struct sge_ofld_txq *ofld_txq;
        struct sge_ofld_rxq *ofld_rxq;
        const int nparams = 6;
        const int flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval);
        const u_int pfvf = sc->pf << S_FW_VIID_PFN;

        INP_WLOCK_ASSERT(synqe->lctx->inp);
        MPASS((synqe->flags & TPF_FLOWC_WR_SENT) == 0);

        ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];
        ofld_rxq = &sc->sge.ofld_rxq[synqe->params.rxq_idx];

        wr = alloc_wrqe(roundup2(flowclen, 16), &ofld_txq->wrq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        flowc = wrtod(wr);
        memset(flowc, 0, wr->wr_len);
        flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
            V_FW_FLOWC_WR_NPARAMS(nparams));
        flowc->flowid_len16 = htonl(V_FW_WR_LEN16(howmany(flowclen, 16)) |
            V_FW_WR_FLOWID(synqe->tid));
        flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
        flowc->mnemval[0].val = htobe32(pfvf);
        flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
        flowc->mnemval[1].val = htobe32(pi->tx_chan);
        flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
        flowc->mnemval[2].val = htobe32(pi->tx_chan);
        flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
        flowc->mnemval[3].val = htobe32(ofld_rxq->iq.abs_id);
        flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDBUF;
        flowc->mnemval[4].val = htobe32(512);
        flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_MSS;
        flowc->mnemval[5].val = htobe32(512);

        synqe->flags |= TPF_FLOWC_WR_SENT;
        t4_wrq_tx(sc, wr);
}

static void
send_abort_rpl_synqe(struct toedev *tod, struct synq_entry *synqe,
    int rst_status)
{
        struct adapter *sc = tod->tod_softc;
        struct wrqe *wr;
        struct cpl_abort_req *req;

        INP_WLOCK_ASSERT(synqe->lctx->inp);

        CTR5(KTR_CXGBE, "%s: synqe %p (0x%x), tid %d%s",
            __func__, synqe, synqe->flags, synqe->tid,
            synqe->flags & TPF_ABORT_SHUTDOWN ?
            " (abort already in progress)" : "");
        if (synqe->flags & TPF_ABORT_SHUTDOWN)
                return; /* abort already in progress */
        synqe->flags |= TPF_ABORT_SHUTDOWN;

        if (!(synqe->flags & TPF_FLOWC_WR_SENT))
                send_flowc_wr_synqe(sc, synqe);

        wr = alloc_wrqe(sizeof(*req),
            &sc->sge.ofld_txq[synqe->params.txq_idx].wrq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        req = wrtod(wr);
        INIT_TP_WR_MIT_CPL(req, CPL_ABORT_REQ, synqe->tid);
        req->rsvd0 = 0; /* don't have a snd_nxt */
        req->rsvd1 = 1; /* no data sent yet */
        req->cmd = rst_status;

        t4_l2t_send(sc, wr, &sc->l2t->l2tab[synqe->params.l2t_idx]);
}

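/*
 * Send a CPL_PASS_OPEN_REQ to create a hardware listener for an IPv4 socket.
 * opt1's SYN_RSS_QUEUE steers SYNs arriving for this stid to the offload rx
 * queue selected in alloc_lctx (create_server6 below is the IPv6 variant).
 */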
static int
create_server(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_pass_open_req *req;
        struct inpcb *inp = lctx->inp;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                log(LOG_ERR, "%s: allocation failure", __func__);
                return (ENOMEM);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, lctx->stid));
        req->local_port = inp->inp_lport;
        req->peer_port = 0;
        req->local_ip = inp->inp_laddr.s_addr;
        req->peer_ip = 0;
        req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
        req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
            F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

        t4_wrq_tx(sc, wr);
        return (0);
}

static int
create_server6(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_pass_open_req6 *req;
        struct inpcb *inp = lctx->inp;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                log(LOG_ERR, "%s: allocation failure", __func__);
                return (ENOMEM);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_PASS_OPEN_REQ6, lctx->stid));
        req->local_port = inp->inp_lport;
        req->peer_port = 0;
        req->local_ip_hi = *(uint64_t *)&inp->in6p_laddr.s6_addr[0];
        req->local_ip_lo = *(uint64_t *)&inp->in6p_laddr.s6_addr[8];
        req->peer_ip_hi = 0;
        req->peer_ip_lo = 0;
        req->opt0 = htobe64(V_TX_CHAN(lctx->ctrlq->eq.tx_chan));
        req->opt1 = htobe64(V_CONN_POLICY(CPL_CONN_POLICY_ASK) |
            F_SYN_RSS_ENABLE | V_SYN_RSS_QUEUE(lctx->ofld_rxq->iq.abs_id));

        t4_wrq_tx(sc, wr);
        return (0);
}

static int
destroy_server(struct adapter *sc, struct listen_ctx *lctx)
{
        struct wrqe *wr;
        struct cpl_close_listsvr_req *req;

        wr = alloc_wrqe(sizeof(*req), lctx->ctrlq);
        if (wr == NULL) {
                /* XXX */
                panic("%s: allocation failure.", __func__);
        }
        req = wrtod(wr);

        INIT_TP_WR(req, 0);
        OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ,
            lctx->stid));
        req->reply_ctrl = htobe16(lctx->ofld_rxq->iq.abs_id);
        req->rsvd = htobe16(0);

        t4_wrq_tx(sc, wr);
        return (0);
}

/*
 * Start a listening server by sending a passive open request to HW.
 *
 * We can't take the adapter lock here, so accesses to sc->flags,
 * sc->offload_map, and if_capenable are all race prone.
 */
int
t4_listen_start(struct toedev *tod, struct tcpcb *tp)
{
        struct adapter *sc = tod->tod_softc;
        struct vi_info *vi;
        struct port_info *pi;
        struct inpcb *inp = tptoinpcb(tp);
        struct listen_ctx *lctx;
        int i, rc, v;
        struct offload_settings settings;

        INP_WLOCK_ASSERT(inp);

        rw_rlock(&sc->policy_lock);
        settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL,
            EVL_MAKETAG(0xfff, 0, 0), inp);
        rw_runlock(&sc->policy_lock);
        if (!settings.offload)
                return (0);

        /* Don't start a hardware listener for any loopback address. */
        if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr))
                return (0);
        if (!(inp->inp_vflag & INP_IPV6) &&
            IN_LOOPBACK(ntohl(inp->inp_laddr.s_addr)))
                return (0);
        if (sc->flags & KERN_TLS_ON)
                return (0);
#if 0
        ADAPTER_LOCK(sc);
        if (IS_BUSY(sc)) {
                log(LOG_ERR, "%s: listen request ignored, %s is busy",
                    __func__, device_get_nameunit(sc->dev));
                goto done;
        }

        KASSERT(uld_active(sc, ULD_TOM),
            ("%s: TOM not initialized", __func__));
#endif

        /*
         * Find an initialized VI with IFCAP_TOE (4 or 6).  We'll use the first
         * such VI's queues to send the passive open and receive the reply to
         * it.
         *
         * XXX: need a way to mark a port in use by offload.  if_cxgbe should
         * then reject any attempt to bring down such a port (and maybe reject
         * attempts to disable IFCAP_TOE on that port too?).
         */
        for_each_port(sc, i) {
                pi = sc->port[i];
                for_each_vi(pi, v, vi) {
                        if (vi->flags & VI_INIT_DONE &&
                            vi->ifp->if_capenable & IFCAP_TOE)
                                goto found;
                }
        }
        goto done;      /* no port that's UP with IFCAP_TOE enabled */
found:

        if (listen_hash_find(sc, inp) != NULL)
                goto done;      /* already setup */

        lctx = alloc_lctx(sc, inp, vi);
        if (lctx == NULL) {
                log(LOG_ERR,
                    "%s: listen request ignored, %s couldn't allocate lctx\n",
                    __func__, device_get_nameunit(sc->dev));
                goto done;
        }
        listen_hash_add(sc, lctx);

        CTR6(KTR_CXGBE, "%s: stid %u (%s), lctx %p, inp %p vflag 0x%x",
            __func__, lctx->stid, tcpstates[tp->t_state], lctx, inp,
            inp->inp_vflag);

        if (inp->inp_vflag & INP_IPV6)
                rc = create_server6(sc, lctx);
        else
                rc = create_server(sc, lctx);
        if (rc != 0) {
                log(LOG_ERR, "%s: %s failed to create hw listener: %d.\n",
                    __func__, device_get_nameunit(sc->dev), rc);
                (void) listen_hash_del(sc, inp);
                inp = release_lctx(sc, lctx);
                /* can't be freed, host stack has a reference */
                KASSERT(inp != NULL, ("%s: inp freed", __func__));
                goto done;
        }
        lctx->flags |= LCTX_RPL_PENDING;
done:
#if 0
        ADAPTER_UNLOCK(sc);
#endif
        return (0);
}

int
t4_listen_stop(struct toedev *tod, struct tcpcb *tp)
{
        struct listen_ctx *lctx;
        struct adapter *sc = tod->tod_softc;
        struct inpcb *inp = tptoinpcb(tp);

        INP_WLOCK_ASSERT(inp);

        lctx = listen_hash_del(sc, inp);
        if (lctx == NULL)
                return (ENOENT);        /* no hardware listener for this inp */

        CTR4(KTR_CXGBE, "%s: stid %u, lctx %p, flags %x", __func__, lctx->stid,
            lctx, lctx->flags);

        /*
         * If the reply to the PASS_OPEN is still pending we'll wait for it to
         * arrive and clean up when it does.
         */
        if (lctx->flags & LCTX_RPL_PENDING) {
                return (EINPROGRESS);
        }

        destroy_server(sc, lctx);
        return (0);
}

static inline struct synq_entry *
alloc_synqe(struct adapter *sc __unused, struct listen_ctx *lctx, int flags)
{
        struct synq_entry *synqe;

        INP_RLOCK_ASSERT(lctx->inp);
        MPASS(flags == M_WAITOK || flags == M_NOWAIT);

        synqe = malloc(sizeof(*synqe), M_CXGBE, flags);
        if (__predict_true(synqe != NULL)) {
                synqe->flags = TPF_SYNQE;
                refcount_init(&synqe->refcnt, 1);
                synqe->lctx = lctx;
                hold_lctx(lctx);        /* Every synqe has a ref on its lctx. */
                synqe->syn = NULL;
        }

        return (synqe);
}

static inline void
hold_synqe(struct synq_entry *synqe)
{

        refcount_acquire(&synqe->refcnt);
}

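/*
 * Release a hold on the synqe.  The last reference also drops the synqe's
 * hold on its lctx and, like release_lctx, this returns NULL if the listening
 * socket's inp was freed as a side effect.
 */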
static inline struct inpcb *
release_synqe(struct adapter *sc, struct synq_entry *synqe)
{
        struct inpcb *inp;

        MPASS(synqe->flags & TPF_SYNQE);
        MPASS(synqe->lctx != NULL);

        inp = synqe->lctx->inp;
        MPASS(inp != NULL);
        INP_WLOCK_ASSERT(inp);

        if (refcount_release(&synqe->refcnt)) {
                inp = release_lctx(sc, synqe->lctx);
                m_freem(synqe->syn);
                free(synqe, M_CXGBE);
        }

        return (inp);
}

void
t4_syncache_added(struct toedev *tod __unused, void *arg)
{
        struct synq_entry *synqe = arg;

        hold_synqe(synqe);
}

void
t4_syncache_removed(struct toedev *tod, void *arg)
{
        struct adapter *sc = tod->tod_softc;
        struct synq_entry *synqe = arg;
        struct inpcb *inp = synqe->lctx->inp;

        /*
         * XXX: this is a LOR but harmless when running from the softclock.
         */
        INP_WLOCK(inp);
        inp = release_synqe(sc, synqe);
        if (inp != NULL)
                INP_WUNLOCK(inp);
}

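/*
 * Called by the syncache instead of transmitting a SYN|ACK itself.  Record
 * iss/irs/ts from the packet and mark the synqe ok_to_respond;
 * do_pass_accept_req checks that flag and sends the actual SYN|ACK via
 * send_synack.
 */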
int
t4_syncache_respond(struct toedev *tod, void *arg, struct mbuf *m)
{
        struct synq_entry *synqe = arg;

        if (atomic_fetchadd_int(&synqe->ok_to_respond, 1) == 0) {
                struct tcpopt to;
                struct ip *ip = mtod(m, struct ip *);
                struct tcphdr *th;

                if (ip->ip_v == IPVERSION)
                        th = (void *)(ip + 1);
                else
                        th = (void *)((struct ip6_hdr *)ip + 1);
                bzero(&to, sizeof(to));
                tcp_dooptions(&to, (void *)(th + 1),
                    (th->th_off << 2) - sizeof(*th), TO_SYN);

                /* save these for later */
                synqe->iss = be32toh(th->th_seq);
                synqe->irs = be32toh(th->th_ack) - 1;
                synqe->ts = to.to_tsval;
        }

        m_freem(m);     /* don't need this any more */
        return (0);
}

static int
do_pass_open_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_pass_open_rpl *cpl = (const void *)(rss + 1);
        int stid = GET_TID(cpl);
        unsigned int status = cpl->status;
        struct listen_ctx *lctx = lookup_stid(sc, stid);
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_PASS_OPEN_RPL,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

        INP_WLOCK(inp);

        CTR4(KTR_CXGBE, "%s: stid %d, status %u, flags 0x%x",
            __func__, stid, status, lctx->flags);

        lctx->flags &= ~LCTX_RPL_PENDING;

        if (status != CPL_ERR_NONE)
                log(LOG_ERR, "listener (stid %u) failed: %d\n", stid, status);

#ifdef INVARIANTS
        /*
         * If the inp has been dropped (listening socket closed) then
         * listen_stop must have run and taken the inp out of the hash.
         */
        if (inp->inp_flags & INP_DROPPED) {
                KASSERT(listen_hash_del(sc, inp) == NULL,
                    ("%s: inp %p still in listen hash", __func__, inp));
        }
#endif

        if (inp->inp_flags & INP_DROPPED && status != CPL_ERR_NONE) {
                if (release_lctx(sc, lctx) != NULL)
                        INP_WUNLOCK(inp);
                return (status);
        }

        /*
         * Listening socket stopped listening earlier and now the chip tells us
         * it has started the hardware listener.  Stop it; the lctx will be
         * released in do_close_server_rpl.
         */
        if (inp->inp_flags & INP_DROPPED) {
                destroy_server(sc, lctx);
                INP_WUNLOCK(inp);
                return (status);
        }

        /*
         * Failed to start hardware listener.  Take inp out of the hash and
         * release our reference on it.  An error message has been logged
         * already.
         */
        if (status != CPL_ERR_NONE) {
                listen_hash_del(sc, inp);
                if (release_lctx(sc, lctx) != NULL)
                        INP_WUNLOCK(inp);
                return (status);
        }

        /* hardware listener open for business */

        INP_WUNLOCK(inp);
        return (status);
}

static int
do_close_server_rpl(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_close_listsvr_rpl *cpl = (const void *)(rss + 1);
        int stid = GET_TID(cpl);
        unsigned int status = cpl->status;
        struct listen_ctx *lctx = lookup_stid(sc, stid);
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_CLOSE_LISTSRV_RPL,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));

        CTR3(KTR_CXGBE, "%s: stid %u, status %u", __func__, stid, status);

        if (status != CPL_ERR_NONE) {
                log(LOG_ERR, "%s: failed (%u) to close listener for stid %u\n",
                    __func__, status, stid);
                return (status);
        }

        INP_WLOCK(inp);
        inp = release_lctx(sc, lctx);
        if (inp != NULL)
                INP_WUNLOCK(inp);

        return (status);
}

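/*
 * Final cleanup of an embryonic connection: return its tids, drop the
 * reference on the L2 entry, and release the synqe (which unlocks the inp
 * unless the inp itself was freed along the way).
 */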
static void
done_with_synqe(struct adapter *sc, struct synq_entry *synqe)
{
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
        struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];
        int ntids;

        INP_WLOCK_ASSERT(inp);
        ntids = inp->inp_vflag & INP_IPV6 ? 2 : 1;

        remove_tid(sc, synqe->tid, ntids);
        release_tid(sc, synqe->tid, lctx->ctrlq);
        t4_l2t_release(e);
        inp = release_synqe(sc, synqe);
        if (inp)
                INP_WUNLOCK(inp);
}

void
synack_failure_cleanup(struct adapter *sc, int tid)
{
        struct synq_entry *synqe = lookup_tid(sc, tid);

        INP_WLOCK(synqe->lctx->inp);
        done_with_synqe(sc, synqe);
}

int
do_abort_req_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_abort_req_rss *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        struct synq_entry *synqe = lookup_tid(sc, tid);
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
        struct sge_ofld_txq *ofld_txq;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_ABORT_REQ_RSS,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

        CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
            __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

        if (negative_advice(cpl->status))
                return (0);     /* Ignore negative advice */

        INP_WLOCK(inp);

        ofld_txq = &sc->sge.ofld_txq[synqe->params.txq_idx];

        if (!(synqe->flags & TPF_FLOWC_WR_SENT))
                send_flowc_wr_synqe(sc, synqe);

        /*
         * If we'd initiated an abort earlier the reply to it is responsible for
         * cleaning up resources.  Otherwise we tear everything down right here
         * right now.  We owe the T4 a CPL_ABORT_RPL no matter what.
         */
        if (synqe->flags & TPF_ABORT_SHUTDOWN) {
                INP_WUNLOCK(inp);
                goto done;
        }

        done_with_synqe(sc, synqe);
        /* inp lock released by done_with_synqe */
done:
        send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST);
        return (0);
}

int
do_abort_rpl_synqe(struct sge_iq *iq, const struct rss_header *rss,
    struct mbuf *m)
{
        struct adapter *sc = iq->adapter;
        const struct cpl_abort_rpl_rss *cpl = (const void *)(rss + 1);
        unsigned int tid = GET_TID(cpl);
        struct synq_entry *synqe = lookup_tid(sc, tid);
        struct listen_ctx *lctx = synqe->lctx;
        struct inpcb *inp = lctx->inp;
#ifdef INVARIANTS
        unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
#endif

        KASSERT(opcode == CPL_ABORT_RPL_RSS,
            ("%s: unexpected opcode 0x%x", __func__, opcode));
        KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
        KASSERT(synqe->tid == tid, ("%s: toep tid mismatch", __func__));

        CTR6(KTR_CXGBE, "%s: tid %u, synqe %p (0x%x), lctx %p, status %d",
            __func__, tid, synqe, synqe->flags, synqe->lctx, cpl->status);

        INP_WLOCK(inp);
        KASSERT(synqe->flags & TPF_ABORT_SHUTDOWN,
            ("%s: wasn't expecting abort reply for synqe %p (0x%x)",
            __func__, synqe, synqe->flags));

        done_with_synqe(sc, synqe);
        /* inp lock released by done_with_synqe */

        return (0);
}

void
t4_offload_socket(struct toedev *tod, void *arg, struct socket *so)
{
        struct adapter *sc = tod->tod_softc;
        struct synq_entry *synqe = arg;
        struct inpcb *inp = sotoinpcb(so);
        struct toepcb *toep = synqe->toep;

        NET_EPOCH_ASSERT();     /* prevents bad race with accept() */
        INP_WLOCK_ASSERT(inp);
        KASSERT(synqe->flags & TPF_SYNQE,
            ("%s: %p not a synq_entry?", __func__, arg));
        MPASS(toep->tid == synqe->tid);

        offload_socket(so, toep);
        make_established(toep, synqe->iss, synqe->irs, synqe->tcp_opt);
        toep->flags |= TPF_CPL_PENDING;
        update_tid(sc, synqe->tid, toep);
        synqe->flags |= TPF_SYNQE_EXPANDED;
        inp->inp_flowtype = (inp->inp_vflag & INP_IPV6) ?
            M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4;
        inp->inp_flowid = synqe->rss_hash;
}

static void
t4opt_to_tcpopt(const struct tcp_options *t4opt, struct tcpopt *to)
{
        bzero(to, sizeof(*to));

        if (t4opt->mss) {
                to->to_flags |= TOF_MSS;
                to->to_mss = be16toh(t4opt->mss);
        }

        if (t4opt->wsf > 0 && t4opt->wsf < 15) {
                to->to_flags |= TOF_SCALE;
                to->to_wscale = t4opt->wsf;
        }

        if (t4opt->tstamp)
                to->to_flags |= TOF_TS;

        if (t4opt->sack)
                to->to_flags |= TOF_SACKPERM;
}

static bool
encapsulated_syn(struct adapter *sc, const struct cpl_pass_accept_req *cpl)
{
        u_int hlen = be32toh(cpl->hdr_len);

        if (chip_id(sc) >= CHELSIO_T6)
                return (G_T6_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
        else
                return (G_ETH_HDR_LEN(hlen) > sizeof(struct ether_vlan_header));
}

static void
pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m,
    struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos)
{
        const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
        const struct ether_header *eh;
        unsigned int hlen = be32toh(cpl->hdr_len);
        uintptr_t l3hdr;
        const struct tcphdr *tcp;

        eh = (const void *)(cpl + 1);
        if (chip_id(sc) >= CHELSIO_T6) {
                l3hdr = ((uintptr_t)eh + G_T6_ETH_HDR_LEN(hlen));
                tcp = (const void *)(l3hdr + G_T6_IP_HDR_LEN(hlen));
        } else {
                l3hdr = ((uintptr_t)eh + G_ETH_HDR_LEN(hlen));
                tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen));
        }

        /* extract TOS (DiffServ + ECN) byte for AccECN */
        if (iptos) {
                if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
                        const struct ip *ip = (const void *)l3hdr;
                        *iptos = ip->ip_tos;
                }
#ifdef INET6
                else
                if (((struct ip *)l3hdr)->ip_v == (IPV6_VERSION >> 4)) {
                        const struct ip6_hdr *ip6 = (const void *)l3hdr;
                        *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
                }
#endif /* INET6 */
        }

        if (inc) {
                bzero(inc, sizeof(*inc));
                inc->inc_fport = tcp->th_sport;
                inc->inc_lport = tcp->th_dport;
                if (((struct ip *)l3hdr)->ip_v == IPVERSION) {
                        const struct ip *ip = (const void *)l3hdr;

                        inc->inc_faddr = ip->ip_src;
                        inc->inc_laddr = ip->ip_dst;
                } else {
                        const struct ip6_hdr *ip6 = (const void *)l3hdr;

                        inc->inc_flags |= INC_ISIPV6;
                        inc->inc6_faddr = ip6->ip6_src;
                        inc->inc6_laddr = ip6->ip6_dst;
                }
        }

        if (th) {
                bcopy(tcp, th, sizeof(*th));
                tcp_fields_to_host(th);         /* just like tcp_input */
        }
}

static struct l2t_entry *
get_l2te_for_nexthop(struct port_info *pi, struct ifnet *ifp,
    struct in_conninfo *inc)
{
        struct l2t_entry *e;
        struct sockaddr_in6 sin6;
        struct sockaddr *dst = (void *)&sin6;
        struct nhop_object *nh;

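        /*
         * dst is backed by sockaddr_in6 storage so that even the IPv4 branch
         * below can copy in a full IPv6 gateway sockaddr (an IPv4 route with
         * an IPv6 nexthop).
         */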
        if (inc->inc_flags & INC_ISIPV6) {
                bzero(dst, sizeof(struct sockaddr_in6));
                dst->sa_len = sizeof(struct sockaddr_in6);
                dst->sa_family = AF_INET6;

                if (IN6_IS_ADDR_LINKLOCAL(&inc->inc6_laddr)) {
                        /* no need for route lookup */
                        e = t4_l2t_get(pi, ifp, dst);
                        return (e);
                }

                nh = fib6_lookup(RT_DEFAULT_FIB, &inc->inc6_faddr, 0, NHR_NONE, 0);
                if (nh == NULL)
                        return (NULL);
                if (nh->nh_ifp != ifp)
                        return (NULL);
                if (nh->nh_flags & NHF_GATEWAY)
                        ((struct sockaddr_in6 *)dst)->sin6_addr = nh->gw6_sa.sin6_addr;
                else
                        ((struct sockaddr_in6 *)dst)->sin6_addr = inc->inc6_faddr;
        } else {
                dst->sa_len = sizeof(struct sockaddr_in);
                dst->sa_family = AF_INET;

                nh = fib4_lookup(RT_DEFAULT_FIB, inc->inc_faddr, 0, NHR_NONE, 0);
                if (nh == NULL)
                        return (NULL);
                if (nh->nh_ifp != ifp)
                        return (NULL);
                if (nh->nh_flags & NHF_GATEWAY)
                        if (nh->gw_sa.sa_family == AF_INET)
                                ((struct sockaddr_in *)dst)->sin_addr = nh->gw4_sa.sin_addr;
                        else
                                *((struct sockaddr_in6 *)dst) = nh->gw6_sa;
                else
                        ((struct sockaddr_in *)dst)->sin_addr = inc->inc_faddr;
        }

        e = t4_l2t_get(pi, ifp, dst);
        return (e);
}

static int
send_synack(struct adapter *sc, struct synq_entry *synqe, uint64_t opt0,
    uint32_t opt2, int tid)
{
        struct wrqe *wr;
        struct cpl_pass_accept_rpl *rpl;
        struct l2t_entry *e = &sc->l2t->l2tab[synqe->params.l2t_idx];

        wr = alloc_wrqe(is_t4(sc) ? sizeof(struct cpl_pass_accept_rpl) :
            sizeof(struct cpl_t5_pass_accept_rpl), &sc->sge.ctrlq[0]);
        if (wr == NULL)
                return (ENOMEM);
        rpl = wrtod(wr);

        if (is_t4(sc))
                INIT_TP_WR_MIT_CPL(rpl, CPL_PASS_ACCEPT_RPL, tid);
        else {
                struct cpl_t5_pass_accept_rpl *rpl5 = (void *)rpl;

                INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid);
                rpl5->iss = htobe32(synqe->iss);
        }
        rpl->opt0 = opt0;
        rpl->opt2 = opt2;

        return (t4_l2t_send(sc, wr, e));
}

#define REJECT_PASS_ACCEPT_REQ(tunnel)  do { \
        if (!tunnel) { \
                m_freem(m); \
                m = NULL; \
        } \
        reject_reason = __LINE__; \
        goto reject; \
} while (0)
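/*
 * With tunnel set the mbuf (the SYN) is kept and handed to the host stack's
 * input path at the reject label; without it the SYN is freed and dropped
 * outright.
 */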
 1164 
 1165 /*
 1166  * The context associated with a tid entry via insert_tid could be a synq_entry
 1167  * or a toepcb.  The only way CPL handlers can tell is via a bit in these flags.
 1168  */
 1169 CTASSERT(offsetof(struct toepcb, flags) == offsetof(struct synq_entry, flags));
 1170 
 1171 /*
 1172  * Incoming SYN on a listening socket.
 1173  *
 1174  * XXX: Every use of ifp in this routine has a bad race with up/down, toe/-toe,
 1175  * etc.
 1176  */
 1177 static int
 1178 do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss,
 1179     struct mbuf *m)
 1180 {
 1181         struct adapter *sc = iq->adapter;
 1182         struct toedev *tod;
 1183         const struct cpl_pass_accept_req *cpl = mtod(m, const void *);
 1184         unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
 1185         unsigned int tid = GET_TID(cpl);
 1186         struct listen_ctx *lctx = lookup_stid(sc, stid);
 1187         struct inpcb *inp;
 1188         struct socket *so;
 1189         struct in_conninfo inc;
 1190         struct tcphdr th;
 1191         struct tcpopt to;
 1192         struct port_info *pi;
 1193         struct vi_info *vi;
 1194         struct ifnet *hw_ifp, *ifp;
 1195         struct l2t_entry *e = NULL;
 1196         struct synq_entry *synqe = NULL;
 1197         int reject_reason, v, ntids;
 1198         uint16_t vid, l2info;
 1199         struct epoch_tracker et;
 1200 #ifdef INVARIANTS
 1201         unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 1202 #endif
 1203         struct offload_settings settings;
 1204         uint8_t iptos;
 1205 
 1206         KASSERT(opcode == CPL_PASS_ACCEPT_REQ,
 1207             ("%s: unexpected opcode 0x%x", __func__, opcode));
 1208         KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 1209 
 1210         CTR4(KTR_CXGBE, "%s: stid %u, tid %u, lctx %p", __func__, stid, tid,
 1211             lctx);
 1212 
 1213         /*
 1214          * Figure out the port the SYN arrived on.  We'll look for an exact VI
 1215          * match in a bit but in case we don't find any we'll use the main VI as
 1216          * the incoming ifnet.
 1217          */
 1218         l2info = be16toh(cpl->l2info);
 1219         pi = sc->port[G_SYN_INTF(l2info)];
 1220         hw_ifp = pi->vi[0].ifp;
 1221         m->m_pkthdr.rcvif = hw_ifp;
 1222 
 1223         CURVNET_SET(lctx->vnet);        /* before any potential REJECT */
 1224 
 1225         /*
 1226          * If VXLAN/NVGRE parsing is enabled then SYNs in the inner traffic will
 1227          * also hit the listener.  We don't want to offload those.
 1228          */
 1229         if (encapsulated_syn(sc, cpl)) {
 1230                 REJECT_PASS_ACCEPT_REQ(true);
 1231         }
 1232 
 1233         /*
 1234          * Use the MAC index to lookup the associated VI.  If this SYN didn't
 1235          * match a perfect MAC filter, punt.
 1236          */
 1237         if (!(l2info & F_SYN_XACT_MATCH)) {
 1238                 REJECT_PASS_ACCEPT_REQ(true);
 1239         }
 1240         for_each_vi(pi, v, vi) {
 1241                 if (vi->xact_addr_filt == G_SYN_MAC_IDX(l2info))
 1242                         goto found;
 1243         }
 1244         REJECT_PASS_ACCEPT_REQ(true);
 1245 found:
 1246         hw_ifp = vi->ifp;       /* the cxgbe ifnet */
 1247         m->m_pkthdr.rcvif = hw_ifp;
 1248         tod = TOEDEV(hw_ifp);
 1249 
 1250         /*
 1251          * Don't offload if the peer requested a TCP option that's not known to
 1252          * the silicon.  Send the SYN to the kernel instead.
 1253          */
 1254         if (__predict_false(cpl->tcpopt.unknown))
 1255                 REJECT_PASS_ACCEPT_REQ(true);
 1256 
 1257         /*
 1258          * Figure out if there is a pseudo interface (vlan, lagg, etc.)
 1259          * involved.  Don't offload if the SYN had a VLAN tag and the vid
 1260          * doesn't match anything on this interface.
 1261          *
 1262          * XXX: lagg support, lagg + vlan support.
 1263          */
 1264         vid = EVL_VLANOFTAG(be16toh(cpl->vlan));
 1265         if (vid != 0xfff && vid != 0) {
 1266                 ifp = VLAN_DEVAT(hw_ifp, vid);
 1267                 if (ifp == NULL)
 1268                         REJECT_PASS_ACCEPT_REQ(true);
 1269         } else
 1270                 ifp = hw_ifp;
 1271 
 1272         /*
 1273          * Don't offload if the ifnet that the SYN came in on is not in the same
 1274          * vnet as the listening socket.
 1275          */
 1276         if (lctx->vnet != ifp->if_vnet)
 1277                 REJECT_PASS_ACCEPT_REQ(true);
 1278 
 1279         pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos);
 1280         if (inc.inc_flags & INC_ISIPV6) {
 1281 
 1282                 /* Don't offload if the ifcap isn't enabled */
 1283                 if ((ifp->if_capenable & IFCAP_TOE6) == 0)
 1284                         REJECT_PASS_ACCEPT_REQ(true);
 1285 
 1286                 /*
 1287                  * SYN must be directed to an IP6 address on this ifnet.  This
 1288                  * is more restrictive than in6_localip.
 1289                  */
 1290                 NET_EPOCH_ENTER(et);
 1291                 if (!in6_ifhasaddr(ifp, &inc.inc6_laddr)) {
 1292                         NET_EPOCH_EXIT(et);
 1293                         REJECT_PASS_ACCEPT_REQ(true);
 1294                 }
 1295 
 1296                 ntids = 2;
 1297         } else {
 1298 
 1299                 /* Don't offload if the ifcap isn't enabled */
 1300                 if ((ifp->if_capenable & IFCAP_TOE4) == 0)
 1301                         REJECT_PASS_ACCEPT_REQ(true);
 1302 
 1303                 /*
 1304                  * SYN must be directed to an IP address on this ifnet.  This
 1305                  * is more restrictive than in_localip.
 1306                  */
 1307                 NET_EPOCH_ENTER(et);
 1308                 if (!in_ifhasaddr(ifp, inc.inc_laddr)) {
 1309                         NET_EPOCH_EXIT(et);
 1310                         REJECT_PASS_ACCEPT_REQ(true);
 1311                 }
 1312 
 1313                 ntids = 1;
 1314         }
 1315 
 1316         e = get_l2te_for_nexthop(pi, ifp, &inc);
 1317         if (e == NULL) {
 1318                 NET_EPOCH_EXIT(et);
 1319                 REJECT_PASS_ACCEPT_REQ(true);
 1320         }
 1321 
 1322         /* Don't offload if the 4-tuple is already in use */
 1323         if (toe_4tuple_check(&inc, &th, ifp) != 0) {
 1324                 NET_EPOCH_EXIT(et);
 1325                 REJECT_PASS_ACCEPT_REQ(false);
 1326         }
 1327 
 1328         inp = lctx->inp;                /* listening socket, not owned by TOE */
 1329         INP_RLOCK(inp);
 1330 
 1331         /* Don't offload if the listening socket has closed */
 1332         if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 1333                 INP_RUNLOCK(inp);
 1334                 NET_EPOCH_EXIT(et);
 1335                 REJECT_PASS_ACCEPT_REQ(false);
 1336         }
 1337         so = inp->inp_socket;
 1338         rw_rlock(&sc->policy_lock);
 1339         settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m,
 1340             EVL_MAKETAG(0xfff, 0, 0), inp);
 1341         rw_runlock(&sc->policy_lock);
 1342         if (!settings.offload) {
 1343                 INP_RUNLOCK(inp);
 1344                 NET_EPOCH_EXIT(et);
 1345                 REJECT_PASS_ACCEPT_REQ(true);   /* Rejected by COP. */
 1346         }
 1347 
 1348         synqe = alloc_synqe(sc, lctx, M_NOWAIT);
 1349         if (synqe == NULL) {
 1350                 INP_RUNLOCK(inp);
 1351                 NET_EPOCH_EXIT(et);
 1352                 REJECT_PASS_ACCEPT_REQ(true);
 1353         }
 1354         MPASS(rss->hash_type == RSS_HASH_TCP);
 1355         synqe->rss_hash = be32toh(rss->hash_val);
 1356         atomic_store_int(&synqe->ok_to_respond, 0);
 1357 
 1358         init_conn_params(vi, &settings, &inc, so, &cpl->tcpopt, e->idx,
 1359             &synqe->params);
 1360 
 1361         /*
 1362          * If all goes well t4_syncache_respond will get called during
 1363          * syncache_add.  Note that syncache_add releases the pcb lock.
 1364          */
 1365         t4opt_to_tcpopt(&cpl->tcpopt, &to);
 1366         toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos);
 1367 
 1368         if (atomic_load_int(&synqe->ok_to_respond) > 0) {
 1369                 uint64_t opt0;
 1370                 uint32_t opt2;
 1371 
 1372                 opt0 = calc_options0(vi, &synqe->params);
 1373                 opt2 = calc_options2(vi, &synqe->params);
 1374 
 1375                 insert_tid(sc, tid, synqe, ntids);
 1376                 synqe->tid = tid;
 1377                 synqe->syn = m;
 1378                 m = NULL;
 1379 
 1380                 if (send_synack(sc, synqe, opt0, opt2, tid) != 0) {
 1381                         remove_tid(sc, tid, ntids);
 1382                         m = synqe->syn;
 1383                         synqe->syn = NULL;
 1384                         NET_EPOCH_EXIT(et);
 1385                         REJECT_PASS_ACCEPT_REQ(true);
 1386                 }
 1387 
 1388                 CTR6(KTR_CXGBE,
 1389                     "%s: stid %u, tid %u, synqe %p, opt0 %#016lx, opt2 %#08x",
 1390                     __func__, stid, tid, synqe, be64toh(opt0), be32toh(opt2));
 1391         } else {
 1392                 NET_EPOCH_EXIT(et);
 1393                 REJECT_PASS_ACCEPT_REQ(false);
 1394         }
 1395 
 1396         NET_EPOCH_EXIT(et);
 1397         CURVNET_RESTORE();
 1398         return (0);
 1399 reject:
 1400         CURVNET_RESTORE();
 1401         CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid,
 1402             reject_reason);
 1403 
 1404         if (e)
 1405                 t4_l2t_release(e);
 1406         release_tid(sc, tid, lctx->ctrlq);
 1407         if (synqe) {
 1408                 inp = synqe->lctx->inp;
 1409                 INP_WLOCK(inp);
 1410                 inp = release_synqe(sc, synqe);
 1411                 if (inp)
 1412                         INP_WUNLOCK(inp);
 1413         }
 1414 
 1415         if (m) {
 1416                 /*
 1417                  * The connection request hit a TOE listener but is being passed
 1418                  * on to the kernel sw stack instead of getting offloaded.
 1419                  */
 1420                 m_adj(m, sizeof(*cpl));
 1421                 m->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID |
 1422                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 1423                 m->m_pkthdr.csum_data = 0xffff;
 1424                 hw_ifp->if_input(hw_ifp, m);
 1425         }
 1426 
 1427         return (reject_reason);
 1428 }
 1429 
 1430 static void
 1431 synqe_to_protohdrs(struct adapter *sc, struct synq_entry *synqe,
 1432     const struct cpl_pass_establish *cpl, struct in_conninfo *inc,
 1433     struct tcphdr *th, struct tcpopt *to)
 1434 {
 1435         uint16_t tcp_opt = be16toh(cpl->tcp_opt);
 1436         uint8_t iptos;
 1437 
 1438         /* start off with the original SYN */
 1439         pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos);
 1440 
 1441         /* modify parts to make it look like the ACK to our SYN|ACK */
 1442         th->th_flags = TH_ACK;
 1443         th->th_ack = synqe->iss + 1;
 1444         th->th_seq = be32toh(cpl->rcv_isn);
 1445         bzero(to, sizeof(*to));
 1446         if (G_TCPOPT_TSTAMP(tcp_opt)) {
 1447                 to->to_flags |= TOF_TS;
 1448                 to->to_tsecr = synqe->ts;
 1449         }
 1450 }
 1451 
 1452 static int
 1453 do_pass_establish(struct sge_iq *iq, const struct rss_header *rss,
 1454     struct mbuf *m)
 1455 {
 1456         struct adapter *sc = iq->adapter;
 1457         struct vi_info *vi;
 1458         struct ifnet *ifp;
 1459         const struct cpl_pass_establish *cpl = (const void *)(rss + 1);
 1460 #if defined(KTR) || defined(INVARIANTS)
 1461         unsigned int stid = G_PASS_OPEN_TID(be32toh(cpl->tos_stid));
 1462 #endif
 1463         unsigned int tid = GET_TID(cpl);
 1464         struct synq_entry *synqe = lookup_tid(sc, tid);
 1465         struct listen_ctx *lctx = synqe->lctx;
 1466         struct inpcb *inp = lctx->inp, *new_inp;
 1467         struct socket *so;
 1468         struct tcphdr th;
 1469         struct tcpopt to;
 1470         struct in_conninfo inc;
 1471         struct toepcb *toep;
 1472         struct epoch_tracker et;
 1473         int rstreason;
 1474 #ifdef INVARIANTS
 1475         unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl)));
 1476 #endif
 1477 
 1478         KASSERT(opcode == CPL_PASS_ESTABLISH,
 1479             ("%s: unexpected opcode 0x%x", __func__, opcode));
 1480         KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 1481         KASSERT(lctx->stid == stid, ("%s: lctx stid mismatch", __func__));
 1482         KASSERT(synqe->flags & TPF_SYNQE,
 1483             ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe));
 1484 
 1485         CURVNET_SET(lctx->vnet);
 1486         NET_EPOCH_ENTER(et);    /* for syncache_expand */
 1487         INP_WLOCK(inp);
 1488 
 1489         CTR6(KTR_CXGBE,
 1490             "%s: stid %u, tid %u, synqe %p (0x%x), inp_flags 0x%x",
 1491             __func__, stid, tid, synqe, synqe->flags, inp->inp_flags);
 1492 
 1493         ifp = synqe->syn->m_pkthdr.rcvif;
 1494         vi = ifp->if_softc;
 1495         KASSERT(vi->adapter == sc,
 1496             ("%s: vi %p, sc %p mismatch", __func__, vi, sc));
 1497 
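              /*
               * The listening socket has been dropped; there is nobody to
               * hand this connection to, so have the hardware reset it.
               */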
 1498         if (__predict_false(inp->inp_flags & INP_DROPPED)) {
 1499 reset:
 1500                 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_SEND_RST);
 1501                 INP_WUNLOCK(inp);
 1502                 NET_EPOCH_EXIT(et);
 1503                 CURVNET_RESTORE();
 1504                 return (0);
 1505         }
 1506 
 1507         KASSERT(synqe->params.rxq_idx == iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0],
 1508             ("%s: CPL arrived on unexpected rxq.  %d %d", __func__,
 1509             synqe->params.rxq_idx,
 1510             (int)(iq_to_ofld_rxq(iq) - &sc->sge.ofld_rxq[0])));
 1511 
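              /*
               * Allocate the offload PCB that will represent this
               * connection to the TOE for the rest of its life.
               */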
 1512         toep = alloc_toepcb(vi, M_NOWAIT);
 1513         if (toep == NULL)
 1514                 goto reset;
 1515         toep->tid = tid;
 1516         toep->l2te = &sc->l2t->l2tab[synqe->params.l2t_idx];
 1517         toep->vnet = lctx->vnet;
 1518         bcopy(&synqe->params, &toep->params, sizeof(toep->params));
 1519         init_toepcb(vi, toep);
 1520 
 1521         MPASS(be32toh(cpl->snd_isn) - 1 == synqe->iss);
 1522         MPASS(be32toh(cpl->rcv_isn) - 1 == synqe->irs);
 1523         synqe->tcp_opt = cpl->tcp_opt;
 1524         synqe->toep = toep;
 1525 
 1526         /* Construct protocol headers that syncache_expand will accept. */
 1527         synqe_to_protohdrs(sc, synqe, cpl, &inc, &th, &to);
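              /*
               * An IPv6 connection needs a reference on a CLIP table entry
               * for its local address: share the listener's entry if it
               * has one, otherwise acquire one now.
               */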
 1528         if (inc.inc_flags & INC_ISIPV6) {
 1529                 if (lctx->ce == NULL) {
 1530                         toep->ce = t4_get_clip_entry(sc, &inc.inc6_laddr, true);
 1531                         if (toep->ce == NULL) {
 1532                                 free_toepcb(toep);
 1533                                 goto reset;     /* RST without a CLIP entry? */
 1534                         }
 1535                 } else {
 1536                         t4_hold_clip_entry(sc, lctx->ce);
 1537                         toep->ce = lctx->ce;
 1538                 }
 1539         }
 1540         so = inp->inp_socket;
 1541         KASSERT(so != NULL, ("%s: socket is NULL", __func__));
 1542 
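              /*
               * Let the kernel's syncache complete the handshake and
               * create the socket for the new connection.  Failure is
               * handled differently depending on whether the peer should
               * see a RST.
               */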
 1543         rstreason = toe_syncache_expand(&inc, &to, &th, &so);
 1544         if (rstreason < 0) {
 1545                 free_toepcb(toep);
 1546                 send_abort_rpl_synqe(TOEDEV(ifp), synqe, CPL_ABORT_NO_RST);
 1547                 INP_WUNLOCK(inp);
 1548                 NET_EPOCH_EXIT(et);
 1549                 CURVNET_RESTORE();
 1550                 return (0);
 1551         } else if (rstreason == 0 || so == NULL) {
 1552                 free_toepcb(toep);
 1553                 goto reset;
 1554         }
 1555 
 1556         /* New connection inpcb is already locked by syncache_expand(). */
 1557         new_inp = sotoinpcb(so);
 1558         INP_WLOCK_ASSERT(new_inp);
 1559         MPASS(so->so_vnet == lctx->vnet);
 1560 
 1561         /*
 1562          * This is for expansion from syncookies.
 1563          *
 1564          * XXX: we've held the tcbinfo lock throughout so there's no risk of
 1565          * anyone accept'ing a connection before we've installed our hooks, but
 1566          * this somewhat defeats the purpose of having a tod_offload_socket :-(
 1567          */
 1568         if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) {
 1569                 tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0);
 1570                 t4_offload_socket(TOEDEV(ifp), synqe, so);
 1571         }
 1572 
 1573         INP_WUNLOCK(new_inp);
 1574 
 1575         /* Done with the synqe */
 1576         inp = release_synqe(sc, synqe);
 1577         if (inp != NULL)
 1578                 INP_WUNLOCK(inp);
 1579         NET_EPOCH_EXIT(et);
 1580         CURVNET_RESTORE();
 1581 
 1582         return (0);
 1583 }
 1584 
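      /*
       * The handlers above implement the passive open (server) side of
       * connection offload; register them with the driver.
       */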
 1585 void
 1586 t4_init_listen_cpl_handlers(void)
 1587 {
 1588 
 1589         t4_register_cpl_handler(CPL_PASS_OPEN_RPL, do_pass_open_rpl);
 1590         t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, do_close_server_rpl);
 1591         t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, do_pass_accept_req);
 1592         t4_register_cpl_handler(CPL_PASS_ESTABLISH, do_pass_establish);
 1593 }
 1594 
 1595 void
 1596 t4_uninit_listen_cpl_handlers(void)
 1597 {
 1598 
 1599         t4_register_cpl_handler(CPL_PASS_OPEN_RPL, NULL);
 1600         t4_register_cpl_handler(CPL_CLOSE_LISTSRV_RPL, NULL);
 1601         t4_register_cpl_handler(CPL_PASS_ACCEPT_REQ, NULL);
 1602         t4_register_cpl_handler(CPL_PASS_ESTABLISH, NULL);
 1603 }
 1604 #endif
