FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_ktls.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause
    3  *
    4  * Copyright (c) 2014-2019 Netflix Inc.
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   25  * SUCH DAMAGE.
   26  */
   27 
   28 #include <sys/cdefs.h>
   29 __FBSDID("$FreeBSD$");
   30 
   31 #include "opt_inet.h"
   32 #include "opt_inet6.h"
   33 #include "opt_kern_tls.h"
   34 #include "opt_ratelimit.h"
   35 #include "opt_rss.h"
   36 
   37 #include <sys/param.h>
   38 #include <sys/kernel.h>
   39 #include <sys/domainset.h>
   40 #include <sys/endian.h>
   41 #include <sys/ktls.h>
   42 #include <sys/lock.h>
   43 #include <sys/mbuf.h>
   44 #include <sys/mutex.h>
   45 #include <sys/rmlock.h>
   46 #include <sys/proc.h>
   47 #include <sys/protosw.h>
   48 #include <sys/refcount.h>
   49 #include <sys/smp.h>
   50 #include <sys/socket.h>
   51 #include <sys/socketvar.h>
   52 #include <sys/sysctl.h>
   53 #include <sys/taskqueue.h>
   54 #include <sys/kthread.h>
   55 #include <sys/uio.h>
   56 #include <sys/vmmeter.h>
   57 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
   58 #include <machine/pcb.h>
   59 #endif
   60 #include <machine/vmparam.h>
   61 #include <net/if.h>
   62 #include <net/if_var.h>
   63 #ifdef RSS
   64 #include <net/netisr.h>
   65 #include <net/rss_config.h>
   66 #endif
   67 #include <net/route.h>
   68 #include <net/route/nhop.h>
   69 #if defined(INET) || defined(INET6)
   70 #include <netinet/in.h>
   71 #include <netinet/in_pcb.h>
   72 #endif
   73 #include <netinet/tcp_var.h>
   74 #ifdef TCP_OFFLOAD
   75 #include <netinet/tcp_offload.h>
   76 #endif
   77 #include <opencrypto/cryptodev.h>
   78 #include <opencrypto/ktls.h>
   79 #include <vm/uma_dbg.h>
   80 #include <vm/vm.h>
   81 #include <vm/vm_pageout.h>
   82 #include <vm/vm_page.h>
   83 #include <vm/vm_pagequeue.h>
   84 
   85 struct ktls_wq {
   86         struct mtx      mtx;
   87         STAILQ_HEAD(, mbuf) m_head;
   88         STAILQ_HEAD(, socket) so_head;
   89         bool            running;
   90         int             lastallocfail;
   91 } __aligned(CACHE_LINE_SIZE);
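/*
 * Each CPU gets its own ktls_wq, and the structure is padded out to a
 * full cache line so that the mutex and list heads of adjacent queues
 * in the ktls_wq array never share a cache line, avoiding false
 * sharing between the per-CPU worker threads.
 */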
   92 
   93 struct ktls_alloc_thread {
   94         uint64_t wakeups;
   95         uint64_t allocs;
   96         struct thread *td;
   97         int running;
   98 };
   99 
  100 struct ktls_domain_info {
  101         int count;
  102         int cpu[MAXCPU];
  103         struct ktls_alloc_thread alloc_td;
  104 };
  105 
  106 struct ktls_domain_info ktls_domains[MAXMEMDOM];
  107 static struct ktls_wq *ktls_wq;
  108 static struct proc *ktls_proc;
  109 static uma_zone_t ktls_session_zone;
  110 static uma_zone_t ktls_buffer_zone;
  111 static uint16_t ktls_cpuid_lookup[MAXCPU];
  112 static int ktls_init_state;
  113 static struct sx ktls_init_lock;
  114 SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init");
  115 
  116 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  117     "Kernel TLS offload");
  118 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  119     "Kernel TLS offload stats");
  120 
  121 #ifdef RSS
  122 static int ktls_bind_threads = 1;
  123 #else
  124 static int ktls_bind_threads;
  125 #endif
  126 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
  127     &ktls_bind_threads, 0,
  128     "Bind crypto threads to cores (1) or cores and domains (2) at boot");
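/*
 * As a CTLFLAG_RDTUN knob this can only be set as a boot-time
 * tunable, e.g. in loader.conf(5):
 *
 *	kern.ipc.tls.bind_threads=2
 */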
  129 
  130 static u_int ktls_maxlen = 16384;
  131 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
  132     &ktls_maxlen, 0, "Maximum TLS record size");
  133 
  134 static int ktls_number_threads;
  135 SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
  136     &ktls_number_threads, 0,
  137     "Number of TLS threads in thread-pool");
  138 
  139 unsigned int ktls_ifnet_max_rexmit_pct = 2;
  140 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
  141     &ktls_ifnet_max_rexmit_pct, 2,
  142     "Max percent bytes retransmitted before ifnet TLS is disabled");
  143 
  144 static bool ktls_offload_enable;
  145 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
  146     &ktls_offload_enable, 0,
  147     "Enable support for kernel TLS offload");
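/*
 * KTLS is disabled by default; as a CTLFLAG_RWTUN knob it can be
 * enabled at runtime:
 *
 *	# sysctl kern.ipc.tls.enable=1
 */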
  148 
  149 static bool ktls_cbc_enable = true;
  150 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
  151     &ktls_cbc_enable, 1,
  152     "Enable support of AES-CBC crypto for kernel TLS");
  153 
  154 static bool ktls_sw_buffer_cache = true;
  155 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
  156     &ktls_sw_buffer_cache, 1,
  157     "Enable caching of output buffers for SW encryption");
  158 
  159 static int ktls_max_alloc = 128;
  160 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN,
  161     &ktls_max_alloc, 128,
  162     "Max number of 16k buffers to allocate in thread context");
  163 
  164 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
  165 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
  166     &ktls_tasks_active, "Number of active tasks");
  167 
  168 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending);
  169 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD,
  170     &ktls_cnt_tx_pending,
  171     "Number of TLS 1.0 records waiting for earlier TLS records");
  172 
  173 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
  174 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
  175     &ktls_cnt_tx_queued,
  176     "Number of TLS records in queue to tasks for SW encryption");
  177 
  178 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued);
  179 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
  180     &ktls_cnt_rx_queued,
  181     "Number of TLS sockets in queue to tasks for SW decryption");
  182 
  183 static COUNTER_U64_DEFINE_EARLY(ktls_offload_total);
  184 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
  185     CTLFLAG_RD, &ktls_offload_total,
  186     "Total successful TLS setups (parameters set)");
  187 
  188 static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls);
  189 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
  190     CTLFLAG_RD, &ktls_offload_enable_calls,
  191     "Total number of TLS enable calls made");
  192 
  193 static COUNTER_U64_DEFINE_EARLY(ktls_offload_active);
  194 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
  195     &ktls_offload_active, "Total Active TLS sessions");
  196 
  197 static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records);
  198 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
  199     &ktls_offload_corrupted_records, "Total corrupted TLS records received");
  200 
  201 static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto);
  202 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
  203     &ktls_offload_failed_crypto, "Total TLS crypto failures");
  204 
  205 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet);
  206 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
  207     &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
  208 
  209 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw);
  210 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
  211     &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
  212 
  213 static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
  214 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
  215     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
  216 
  217 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
  218 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
  219     &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
  220 
  221 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
  222 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
  223     &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
  224 
  225 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  226     "Software TLS session stats");
  227 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  228     "Hardware (ifnet) TLS session stats");
  229 #ifdef TCP_OFFLOAD
  230 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  231     "TOE TLS session stats");
  232 #endif
  233 
  234 static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc);
  235 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
  236     "Active number of software TLS sessions using AES-CBC");
  237 
  238 static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm);
  239 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
  240     "Active number of software TLS sessions using AES-GCM");
  241 
  242 static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20);
  243 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD,
  244     &ktls_sw_chacha20,
  245     "Active number of software TLS sessions using Chacha20-Poly1305");
  246 
  247 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc);
  248 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
  249     &ktls_ifnet_cbc,
  250     "Active number of ifnet TLS sessions using AES-CBC");
  251 
  252 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm);
  253 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
  254     &ktls_ifnet_gcm,
  255     "Active number of ifnet TLS sessions using AES-GCM");
  256 
  257 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20);
  258 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD,
  259     &ktls_ifnet_chacha20,
  260     "Active number of ifnet TLS sessions using Chacha20-Poly1305");
  261 
  262 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset);
  263 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
  264     &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
  265 
  266 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped);
  267 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
  268     &ktls_ifnet_reset_dropped,
  269     "TLS sessions dropped after failing to update ifnet send tag");
  270 
  271 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed);
  272 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
  273     &ktls_ifnet_reset_failed,
  274     "TLS sessions that failed to allocate a new ifnet send tag");
  275 
  276 static int ktls_ifnet_permitted;
  277 SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
  278     &ktls_ifnet_permitted, 1,
  279     "Whether to permit hardware (ifnet) TLS sessions");
  280 
  281 #ifdef TCP_OFFLOAD
  282 static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc);
  283 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
  284     &ktls_toe_cbc,
  285     "Active number of TOE TLS sessions using AES-CBC");
  286 
  287 static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm);
  288 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
  289     &ktls_toe_gcm,
  290     "Active number of TOE TLS sessions using AES-GCM");
  291 
  292 static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20);
  293 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD,
  294     &ktls_toe_chacha20,
  295     "Active number of TOE TLS sessions using Chacha20-Poly1305");
  296 #endif
  297 
  298 static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
  299 
  300 #if defined(INET) || defined(INET6)
  301 static void ktls_reset_receive_tag(void *context, int pending);
  302 static void ktls_reset_send_tag(void *context, int pending);
  303 #endif
  304 static void ktls_work_thread(void *ctx);
  305 static void ktls_alloc_thread(void *ctx);
  306 
  307 #if defined(INET) || defined(INET6)
  308 static u_int
  309 ktls_get_cpu(struct socket *so)
  310 {
  311         struct inpcb *inp;
  312 #ifdef NUMA
  313         struct ktls_domain_info *di;
  314 #endif
  315         u_int cpuid;
  316 
  317         inp = sotoinpcb(so);
  318 #ifdef RSS
  319         cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
  320         if (cpuid != NETISR_CPUID_NONE)
  321                 return (cpuid);
  322 #endif
  323         /*
  324          * Just use the flowid to shard connections in a repeatable
  325          * fashion.  Note that TLS 1.0 sessions rely on the
  326          * serialization provided by having the same connection use
  327          * the same queue.
  328          */
  329 #ifdef NUMA
  330         if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) {
  331                 di = &ktls_domains[inp->inp_numa_domain];
  332                 cpuid = di->cpu[inp->inp_flowid % di->count];
  333         } else
  334 #endif
  335                 cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads];
  336         return (cpuid);
  337 }
  338 #endif
  339 
  340 static int
  341 ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
  342 {
  343         vm_page_t m;
  344         int i, req;
  345 
  346         KASSERT((ktls_maxlen & PAGE_MASK) == 0,
  347             ("%s: ktls max length %d is not page size-aligned",
  348             __func__, ktls_maxlen));
  349 
  350         req = VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags);
  351         for (i = 0; i < count; i++) {
  352                 m = vm_page_alloc_noobj_contig_domain(domain, req,
  353                     atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
  354                     VM_MEMATTR_DEFAULT);
  355                 if (m == NULL)
  356                         break;
  357                 store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
  358         }
  359         return (i);
  360 }
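/*
 * ktls_buffer_import (above) and ktls_buffer_release (below) are the
 * import/release callbacks of the "ktls_buffers" cache zone created in
 * ktls_init(): UMA invokes the import to refill its buckets with up to
 * 'count' wired, physically contiguous, ktls_maxlen-sized buffers
 * (returned as direct-map addresses), and the release to hand them
 * back to the VM.  A consumer then simply allocates from the zone,
 * e.g. (sketch):
 *
 *	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT);
 */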
  361 
  362 static void
  363 ktls_buffer_release(void *arg __unused, void **store, int count)
  364 {
  365         vm_page_t m;
  366         int i, j;
  367 
  368         for (i = 0; i < count; i++) {
  369                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
  370                 for (j = 0; j < atop(ktls_maxlen); j++) {
  371                         (void)vm_page_unwire_noq(m + j);
  372                         vm_page_free(m + j);
  373                 }
  374         }
  375 }
  376 
  377 static void
  378 ktls_free_mext_contig(struct mbuf *m)
  379 {
  380         M_ASSERTEXTPG(m);
  381         uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0]));
  382 }
  383 
  384 static int
  385 ktls_init(void)
  386 {
  387         struct thread *td;
  388         struct pcpu *pc;
  389         int count, domain, error, i;
  390 
  391         ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
  392             M_WAITOK | M_ZERO);
  393 
  394         ktls_session_zone = uma_zcreate("ktls_session",
  395             sizeof(struct ktls_session),
  396             NULL, NULL, NULL, NULL,
  397             UMA_ALIGN_CACHE, 0);
  398 
  399         if (ktls_sw_buffer_cache) {
  400                 ktls_buffer_zone = uma_zcache_create("ktls_buffers",
  401                     roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
  402                     ktls_buffer_import, ktls_buffer_release, NULL,
  403                     UMA_ZONE_FIRSTTOUCH);
  404         }
  405 
  406         /*
  407          * Initialize the workqueues to run the TLS work.  We create a
  408          * work queue for each CPU.
  409          */
  410         CPU_FOREACH(i) {
  411                 STAILQ_INIT(&ktls_wq[i].m_head);
  412                 STAILQ_INIT(&ktls_wq[i].so_head);
  413                 mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
  414                 if (ktls_bind_threads > 1) {
  415                         pc = pcpu_find(i);
  416                         domain = pc->pc_domain;
  417                         count = ktls_domains[domain].count;
  418                         ktls_domains[domain].cpu[count] = i;
  419                         ktls_domains[domain].count++;
  420                 }
  421                 ktls_cpuid_lookup[ktls_number_threads] = i;
  422                 ktls_number_threads++;
  423         }
  424 
  425         /*
  426          * If we somehow have an empty domain, fall back to choosing
  427          * among all KTLS threads.
  428          */
  429         if (ktls_bind_threads > 1) {
  430                 for (i = 0; i < vm_ndomains; i++) {
  431                         if (ktls_domains[i].count == 0) {
  432                                 ktls_bind_threads = 1;
  433                                 break;
  434                         }
  435                 }
  436         }
  437 
  438         /* Start kthreads for each workqueue. */
  439         CPU_FOREACH(i) {
  440                 error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
  441                     &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
  442                 if (error) {
  443                         printf("Can't add KTLS thread %d error %d\n", i, error);
  444                         return (error);
  445                 }
  446         }
  447 
  448         /*
  449          * Start an allocation thread per-domain to perform blocking allocations
  450          * of 16k physically contiguous TLS crypto destination buffers.
  451          */
  452         if (ktls_sw_buffer_cache) {
  453                 for (domain = 0; domain < vm_ndomains; domain++) {
  454                         if (VM_DOMAIN_EMPTY(domain))
  455                                 continue;
  456                         if (CPU_EMPTY(&cpuset_domain[domain]))
  457                                 continue;
  458                         error = kproc_kthread_add(ktls_alloc_thread,
  459                             &ktls_domains[domain], &ktls_proc,
  460                             &ktls_domains[domain].alloc_td.td,
  461                             0, 0, "KTLS", "alloc_%d", domain);
  462                         if (error) {
  463                                 printf("Can't add KTLS alloc thread %d error %d\n",
  464                                     domain, error);
  465                                 return (error);
  466                         }
  467                 }
  468         }
  469 
  470         if (bootverbose)
  471                 printf("KTLS: Initialized %d threads\n", ktls_number_threads);
  472         return (0);
  473 }
  474 
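/*
 * ktls_start_kthreads() defers thread creation until KTLS is first
 * used and follows the double-checked locking pattern: the lock-free
 * fast path loads ktls_init_state with acquire semantics (> 0 means
 * initialized, < 0 means initialization failed), while the slow path
 * re-checks the state under ktls_init_lock so that ktls_init() runs
 * exactly once.  The final store uses release semantics, making all of
 * ktls_init()'s side effects visible before the state change is.
 */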
  475 static int
  476 ktls_start_kthreads(void)
  477 {
  478         int error, state;
  479 
  480 start:
  481         state = atomic_load_acq_int(&ktls_init_state);
  482         if (__predict_true(state > 0))
  483                 return (0);
  484         if (state < 0)
  485                 return (ENXIO);
  486 
  487         sx_xlock(&ktls_init_lock);
  488         if (ktls_init_state != 0) {
  489                 sx_xunlock(&ktls_init_lock);
  490                 goto start;
  491         }
  492 
  493         error = ktls_init();
  494         if (error == 0)
  495                 state = 1;
  496         else
  497                 state = -1;
  498         atomic_store_rel_int(&ktls_init_state, state);
  499         sx_xunlock(&ktls_init_lock);
  500         return (error);
  501 }
  502 
  503 #if defined(INET) || defined(INET6)
  504 static int
  505 ktls_create_session(struct socket *so, struct tls_enable *en,
  506     struct ktls_session **tlsp, int direction)
  507 {
  508         struct ktls_session *tls;
  509         int error;
  510 
  511         /* Only TLS 1.0 - 1.3 are supported. */
  512         if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
  513                 return (EINVAL);
  514         if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
  515             en->tls_vminor > TLS_MINOR_VER_THREE)
  516                 return (EINVAL);
  517 
  518         if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
  519                 return (EINVAL);
  520         if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
  521                 return (EINVAL);
  522         if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
  523                 return (EINVAL);
  524 
  525         /* All supported algorithms require a cipher key. */
  526         if (en->cipher_key_len == 0)
  527                 return (EINVAL);
  528 
  529         /* No flags are currently supported. */
  530         if (en->flags != 0)
  531                 return (EINVAL);
  532 
  533         /* Common checks for supported algorithms. */
  534         switch (en->cipher_algorithm) {
  535         case CRYPTO_AES_NIST_GCM_16:
  536                 /*
  537                  * auth_algorithm isn't used, but permit GMAC values
  538                  * for compatibility.
  539                  */
  540                 switch (en->auth_algorithm) {
  541                 case 0:
  542 #ifdef COMPAT_FREEBSD12
  543                 /* XXX: Really 13.0-current COMPAT. */
  544                 case CRYPTO_AES_128_NIST_GMAC:
  545                 case CRYPTO_AES_192_NIST_GMAC:
  546                 case CRYPTO_AES_256_NIST_GMAC:
  547 #endif
  548                         break;
  549                 default:
  550                         return (EINVAL);
  551                 }
  552                 if (en->auth_key_len != 0)
  553                         return (EINVAL);
  554                 switch (en->tls_vminor) {
  555                 case TLS_MINOR_VER_TWO:
  556                         if (en->iv_len != TLS_AEAD_GCM_LEN)
  557                                 return (EINVAL);
  558                         break;
  559                 case TLS_MINOR_VER_THREE:
  560                         if (en->iv_len != TLS_1_3_GCM_IV_LEN)
  561                                 return (EINVAL);
  562                         break;
  563                 default:
  564                         return (EINVAL);
  565                 }
  566                 break;
  567         case CRYPTO_AES_CBC:
  568                 switch (en->auth_algorithm) {
  569                 case CRYPTO_SHA1_HMAC:
  570                         break;
  571                 case CRYPTO_SHA2_256_HMAC:
  572                 case CRYPTO_SHA2_384_HMAC:
  573                         if (en->tls_vminor != TLS_MINOR_VER_TWO)
  574                                 return (EINVAL);
  575                         break;
  576                 default:
  577                         return (EINVAL);
  578                 }
  579                 if (en->auth_key_len == 0)
  580                         return (EINVAL);
  581 
  582                 /*
  583                  * TLS 1.0 requires an implicit IV.  TLS 1.1 and 1.2
  584                  * use explicit IVs.
  585                  */
  586                 switch (en->tls_vminor) {
  587                 case TLS_MINOR_VER_ZERO:
  588                         if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
  589                                 return (EINVAL);
  590                         break;
  591                 case TLS_MINOR_VER_ONE:
  592                 case TLS_MINOR_VER_TWO:
  593                         /* Ignore any supplied IV. */
  594                         en->iv_len = 0;
  595                         break;
  596                 default:
  597                         return (EINVAL);
  598                 }
  599                 break;
  600         case CRYPTO_CHACHA20_POLY1305:
  601                 if (en->auth_algorithm != 0 || en->auth_key_len != 0)
  602                         return (EINVAL);
  603                 if (en->tls_vminor != TLS_MINOR_VER_TWO &&
  604                     en->tls_vminor != TLS_MINOR_VER_THREE)
  605                         return (EINVAL);
  606                 if (en->iv_len != TLS_CHACHA20_IV_LEN)
  607                         return (EINVAL);
  608                 break;
  609         default:
  610                 return (EINVAL);
  611         }
  612 
  613         error = ktls_start_kthreads();
  614         if (error != 0)
  615                 return (error);
  616 
  617         tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
  618 
  619         counter_u64_add(ktls_offload_active, 1);
  620 
  621         refcount_init(&tls->refcount, 1);
  622         if (direction == KTLS_RX)
  623                 TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_receive_tag, tls);
  624         else
  625                 TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
  626 
  627         tls->wq_index = ktls_get_cpu(so);
  628 
  629         tls->params.cipher_algorithm = en->cipher_algorithm;
  630         tls->params.auth_algorithm = en->auth_algorithm;
  631         tls->params.tls_vmajor = en->tls_vmajor;
  632         tls->params.tls_vminor = en->tls_vminor;
  633         tls->params.flags = en->flags;
  634         tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
  635 
  636         /* Set the header and trailer lengths. */
  637         tls->params.tls_hlen = sizeof(struct tls_record_layer);
  638         switch (en->cipher_algorithm) {
  639         case CRYPTO_AES_NIST_GCM_16:
  640                 /*
  641                  * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
  642                  * nonce.  TLS 1.3 uses a 12 byte implicit IV.
  643                  */
  644                 if (en->tls_vminor < TLS_MINOR_VER_THREE)
  645                         tls->params.tls_hlen += sizeof(uint64_t);
  646                 tls->params.tls_tlen = AES_GMAC_HASH_LEN;
  647                 tls->params.tls_bs = 1;
  648                 break;
  649         case CRYPTO_AES_CBC:
  650                 switch (en->auth_algorithm) {
  651                 case CRYPTO_SHA1_HMAC:
  652                         if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
  653                                 /* Implicit IV, no nonce. */
  654                                 tls->sequential_records = true;
  655                                 tls->next_seqno = be64dec(en->rec_seq);
  656                                 STAILQ_INIT(&tls->pending_records);
  657                         } else {
  658                                 tls->params.tls_hlen += AES_BLOCK_LEN;
  659                         }
  660                         tls->params.tls_tlen = AES_BLOCK_LEN +
  661                             SHA1_HASH_LEN;
  662                         break;
  663                 case CRYPTO_SHA2_256_HMAC:
  664                         tls->params.tls_hlen += AES_BLOCK_LEN;
  665                         tls->params.tls_tlen = AES_BLOCK_LEN +
  666                             SHA2_256_HASH_LEN;
  667                         break;
  668                 case CRYPTO_SHA2_384_HMAC:
  669                         tls->params.tls_hlen += AES_BLOCK_LEN;
  670                         tls->params.tls_tlen = AES_BLOCK_LEN +
  671                             SHA2_384_HASH_LEN;
  672                         break;
  673                 default:
  674                         panic("invalid hmac");
  675                 }
  676                 tls->params.tls_bs = AES_BLOCK_LEN;
  677                 break;
  678         case CRYPTO_CHACHA20_POLY1305:
  679                 /*
  680                  * Chacha20 uses a 12 byte implicit IV.
  681                  */
  682                 tls->params.tls_tlen = POLY1305_HASH_LEN;
  683                 tls->params.tls_bs = 1;
  684                 break;
  685         default:
  686                 panic("invalid cipher");
  687         }
  688 
  689         /*
  690          * TLS 1.3 includes optional padding which we do not support,
  691          * and also puts the "real" record type at the end of the
  692          * encrypted data.
  693          */
  694         if (en->tls_vminor == TLS_MINOR_VER_THREE)
  695                 tls->params.tls_tlen += sizeof(uint8_t);
  696 
  697         KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
  698             ("TLS header length too long: %d", tls->params.tls_hlen));
  699         KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
  700             ("TLS trailer length too long: %d", tls->params.tls_tlen));
  701 
  702         if (en->auth_key_len != 0) {
  703                 tls->params.auth_key_len = en->auth_key_len;
  704                 tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
  705                     M_WAITOK);
  706                 error = copyin(en->auth_key, tls->params.auth_key,
  707                     en->auth_key_len);
  708                 if (error)
  709                         goto out;
  710         }
  711 
  712         tls->params.cipher_key_len = en->cipher_key_len;
  713         tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
  714         error = copyin(en->cipher_key, tls->params.cipher_key,
  715             en->cipher_key_len);
  716         if (error)
  717                 goto out;
  718 
  719         /*
  720          * This holds the implicit portion of the nonce for AEAD
  721          * ciphers and the initial implicit IV for TLS 1.0.  The
  722          * explicit portions of the IV are generated in ktls_frame().
  723          */
  724         if (en->iv_len != 0) {
  725                 tls->params.iv_len = en->iv_len;
  726                 error = copyin(en->iv, tls->params.iv, en->iv_len);
  727                 if (error)
  728                         goto out;
  729 
  730                 /*
  731                  * For TLS 1.2 with GCM, generate an 8-byte nonce as a
  732                  * counter to generate unique explicit IVs.
  733                  *
  734                  * Store this counter in the last 8 bytes of the IV
  735                  * array so that it is 8-byte aligned.
  736                  */
  737                 if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
  738                     en->tls_vminor == TLS_MINOR_VER_TWO)
  739                         arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
  740         }
  741 
  742         *tlsp = tls;
  743         return (0);
  744 
  745 out:
  746         ktls_free(tls);
  747         return (error);
  748 }
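/*
 * A worked example of the framing parameters computed above: for TLS
 * 1.2 with AES-GCM, tls_hlen is sizeof(struct tls_record_layer) (5
 * bytes) plus the 8-byte explicit nonce, i.e. 13, tls_tlen is
 * AES_GMAC_HASH_LEN (16), and tls_bs is 1 (a stream-like cipher).
 * For TLS 1.3 with AES-GCM the nonce is fully implicit, so tls_hlen
 * stays at 5 while tls_tlen grows by one byte for the trailing inner
 * record type, giving 17.
 */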
  749 
  750 static struct ktls_session *
  751 ktls_clone_session(struct ktls_session *tls, int direction)
  752 {
  753         struct ktls_session *tls_new;
  754 
  755         tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
  756 
  757         counter_u64_add(ktls_offload_active, 1);
  758 
  759         refcount_init(&tls_new->refcount, 1);
  760         if (direction == KTLS_RX)
  761                 TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_receive_tag,
  762                     tls_new);
  763         else
  764                 TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_send_tag,
  765                     tls_new);
  766 
  767         /* Copy fields from existing session. */
  768         tls_new->params = tls->params;
  769         tls_new->wq_index = tls->wq_index;
  770 
  771         /* Deep copy keys. */
  772         if (tls_new->params.auth_key != NULL) {
  773                 tls_new->params.auth_key = malloc(tls->params.auth_key_len,
  774                     M_KTLS, M_WAITOK);
  775                 memcpy(tls_new->params.auth_key, tls->params.auth_key,
  776                     tls->params.auth_key_len);
  777         }
  778 
  779         tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
  780             M_WAITOK);
  781         memcpy(tls_new->params.cipher_key, tls->params.cipher_key,
  782             tls->params.cipher_key_len);
  783 
  784         return (tls_new);
  785 }
  786 
  787 #ifdef TCP_OFFLOAD
  788 static int
  789 ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
  790 {
  791         struct inpcb *inp;
  792         struct tcpcb *tp;
  793         int error;
  794 
  795         inp = so->so_pcb;
  796         INP_WLOCK(inp);
  797         if (inp->inp_flags & INP_DROPPED) {
  798                 INP_WUNLOCK(inp);
  799                 return (ECONNRESET);
  800         }
  801         if (inp->inp_socket == NULL) {
  802                 INP_WUNLOCK(inp);
  803                 return (ECONNRESET);
  804         }
  805         tp = intotcpcb(inp);
  806         if (!(tp->t_flags & TF_TOE)) {
  807                 INP_WUNLOCK(inp);
  808                 return (EOPNOTSUPP);
  809         }
  810 
  811         error = tcp_offload_alloc_tls_session(tp, tls, direction);
  812         INP_WUNLOCK(inp);
  813         if (error == 0) {
  814                 tls->mode = TCP_TLS_MODE_TOE;
  815                 switch (tls->params.cipher_algorithm) {
  816                 case CRYPTO_AES_CBC:
  817                         counter_u64_add(ktls_toe_cbc, 1);
  818                         break;
  819                 case CRYPTO_AES_NIST_GCM_16:
  820                         counter_u64_add(ktls_toe_gcm, 1);
  821                         break;
  822                 case CRYPTO_CHACHA20_POLY1305:
  823                         counter_u64_add(ktls_toe_chacha20, 1);
  824                         break;
  825                 }
  826         }
  827         return (error);
  828 }
  829 #endif
  830 
  831 /*
  832  * Common code used when first enabling ifnet TLS on a connection or
  833  * when allocating a new ifnet TLS session due to a routing change.
  834  * This function allocates a new TLS send tag on whatever interface
  835  * the connection is currently routed over.
  836  */
  837 static int
  838 ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
  839     struct m_snd_tag **mstp)
  840 {
  841         union if_snd_tag_alloc_params params;
  842         struct ifnet *ifp;
  843         struct nhop_object *nh;
  844         struct tcpcb *tp;
  845         int error;
  846 
  847         INP_RLOCK(inp);
  848         if (inp->inp_flags & INP_DROPPED) {
  849                 INP_RUNLOCK(inp);
  850                 return (ECONNRESET);
  851         }
  852         if (inp->inp_socket == NULL) {
  853                 INP_RUNLOCK(inp);
  854                 return (ECONNRESET);
  855         }
  856         tp = intotcpcb(inp);
  857 
  858         /*
  859          * Check administrative controls on ifnet TLS to determine if
  860          * ifnet TLS should be denied.
  861          *
  862          * - Always permit 'force' requests.
  863          * - ktls_ifnet_permitted == 0: always deny.
  864          */
  865         if (!force && ktls_ifnet_permitted == 0) {
  866                 INP_RUNLOCK(inp);
  867                 return (ENXIO);
  868         }
  869 
  870         /*
  871          * XXX: Use the cached route in the inpcb to find the
  872          * interface.  This should perhaps instead use
  873          * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
  874          * enabled after a connection has completed key negotiation in
  875          * userland, the cached route will be present in practice.
  876          */
  877         nh = inp->inp_route.ro_nh;
  878         if (nh == NULL) {
  879                 INP_RUNLOCK(inp);
  880                 return (ENXIO);
  881         }
  882         ifp = nh->nh_ifp;
  883         if_ref(ifp);
  884 
  885         /*
  886          * Allocate a TLS + ratelimit tag if the connection has an
  887          * existing pacing rate.
  888          */
  889         if (tp->t_pacing_rate != -1 &&
  890             (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
  891                 params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
  892                 params.tls_rate_limit.inp = inp;
  893                 params.tls_rate_limit.tls = tls;
  894                 params.tls_rate_limit.max_rate = tp->t_pacing_rate;
  895         } else {
  896                 params.hdr.type = IF_SND_TAG_TYPE_TLS;
  897                 params.tls.inp = inp;
  898                 params.tls.tls = tls;
  899         }
  900         params.hdr.flowid = inp->inp_flowid;
  901         params.hdr.flowtype = inp->inp_flowtype;
  902         params.hdr.numa_domain = inp->inp_numa_domain;
  903         INP_RUNLOCK(inp);
  904 
  905         if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
  906                 error = EOPNOTSUPP;
  907                 goto out;
  908         }
  909         if (inp->inp_vflag & INP_IPV6) {
  910                 if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
  911                         error = EOPNOTSUPP;
  912                         goto out;
  913                 }
  914         } else {
  915                 if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
  916                         error = EOPNOTSUPP;
  917                         goto out;
  918                 }
  919         }
  920         error = m_snd_tag_alloc(ifp, &params, mstp);
  921 out:
  922         if_rele(ifp);
  923         return (error);
  924 }
  925 
  926 /*
  927  * Allocate an initial TLS receive tag for doing HW decryption of TLS
  928  * data.
  929  *
  930  * This function allocates a new TLS receive tag on whatever interface
  931  * the connection is currently routed over.  If the connection ends up
  932  * using a different interface for receive this will get fixed up via
  933  * ktls_input_ifp_mismatch as future packets arrive.
  934  */
  935 static int
  936 ktls_alloc_rcv_tag(struct inpcb *inp, struct ktls_session *tls,
  937     struct m_snd_tag **mstp)
  938 {
  939         union if_snd_tag_alloc_params params;
  940         struct ifnet *ifp;
  941         struct nhop_object *nh;
  942         int error;
  943 
  944         if (!ktls_ocf_recrypt_supported(tls))
  945                 return (ENXIO);
  946 
  947         INP_RLOCK(inp);
  948         if (inp->inp_flags & INP_DROPPED) {
  949                 INP_RUNLOCK(inp);
  950                 return (ECONNRESET);
  951         }
  952         if (inp->inp_socket == NULL) {
  953                 INP_RUNLOCK(inp);
  954                 return (ECONNRESET);
  955         }
  956 
  957         /*
  958          * Check administrative controls on ifnet TLS to determine if
  959          * ifnet TLS should be denied.
  960          */
  961         if (ktls_ifnet_permitted == 0) {
  962                 INP_RUNLOCK(inp);
  963                 return (ENXIO);
  964         }
  965 
  966         /*
  967          * XXX: As with ktls_alloc_snd_tag, use the cached route in
  968          * the inpcb to find the interface.
  969          */
  970         nh = inp->inp_route.ro_nh;
  971         if (nh == NULL) {
  972                 INP_RUNLOCK(inp);
  973                 return (ENXIO);
  974         }
  975         ifp = nh->nh_ifp;
  976         if_ref(ifp);
  977         tls->rx_ifp = ifp;
  978 
  979         params.hdr.type = IF_SND_TAG_TYPE_TLS_RX;
  980         params.hdr.flowid = inp->inp_flowid;
  981         params.hdr.flowtype = inp->inp_flowtype;
  982         params.hdr.numa_domain = inp->inp_numa_domain;
  983         params.tls_rx.inp = inp;
  984         params.tls_rx.tls = tls;
  985         params.tls_rx.vlan_id = 0;
  986 
  987         INP_RUNLOCK(inp);
  988 
  989         if (inp->inp_vflag & INP_IPV6) {
  990                 if ((ifp->if_capenable2 & IFCAP2_RXTLS6) == 0) {
  991                         error = EOPNOTSUPP;
  992                         goto out;
  993                 }
  994         } else {
  995                 if ((ifp->if_capenable2 & IFCAP2_RXTLS4) == 0) {
  996                         error = EOPNOTSUPP;
  997                         goto out;
  998                 }
  999         }
 1000         error = m_snd_tag_alloc(ifp, &params, mstp);
 1001 
 1002         /*
 1003          * If this connection is over a vlan, vlan_snd_tag_alloc
  1004  * rewrites vlan_id with the connection's VLAN ID.  Save the VLAN
 1005          * ID for use in ktls_reset_receive_tag which allocates new
 1006          * receive tags directly from the leaf interface bypassing
 1007          * if_vlan.
 1008          */
 1009         if (error == 0)
 1010                 tls->rx_vlan_id = params.tls_rx.vlan_id;
 1011 out:
 1012         return (error);
 1013 }
 1014 
 1015 static int
 1016 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, int direction,
 1017     bool force)
 1018 {
 1019         struct m_snd_tag *mst;
 1020         int error;
 1021 
 1022         switch (direction) {
 1023         case KTLS_TX:
 1024                 error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst);
 1025                 if (__predict_false(error != 0))
 1026                         goto done;
 1027                 break;
 1028         case KTLS_RX:
 1029                 KASSERT(!force, ("%s: forced receive tag", __func__));
 1030                 error = ktls_alloc_rcv_tag(so->so_pcb, tls, &mst);
 1031                 if (__predict_false(error != 0))
 1032                         goto done;
 1033                 break;
 1034         default:
 1035                 __assert_unreachable();
 1036         }
 1037 
 1038         tls->mode = TCP_TLS_MODE_IFNET;
 1039         tls->snd_tag = mst;
 1040 
 1041         switch (tls->params.cipher_algorithm) {
 1042         case CRYPTO_AES_CBC:
 1043                 counter_u64_add(ktls_ifnet_cbc, 1);
 1044                 break;
 1045         case CRYPTO_AES_NIST_GCM_16:
 1046                 counter_u64_add(ktls_ifnet_gcm, 1);
 1047                 break;
 1048         case CRYPTO_CHACHA20_POLY1305:
 1049                 counter_u64_add(ktls_ifnet_chacha20, 1);
 1050                 break;
 1051         default:
 1052                 break;
 1053         }
 1054 done:
 1055         return (error);
 1056 }
 1057 
 1058 static void
 1059 ktls_use_sw(struct ktls_session *tls)
 1060 {
 1061         tls->mode = TCP_TLS_MODE_SW;
 1062         switch (tls->params.cipher_algorithm) {
 1063         case CRYPTO_AES_CBC:
 1064                 counter_u64_add(ktls_sw_cbc, 1);
 1065                 break;
 1066         case CRYPTO_AES_NIST_GCM_16:
 1067                 counter_u64_add(ktls_sw_gcm, 1);
 1068                 break;
 1069         case CRYPTO_CHACHA20_POLY1305:
 1070                 counter_u64_add(ktls_sw_chacha20, 1);
 1071                 break;
 1072         }
 1073 }
 1074 
 1075 static int
 1076 ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
 1077 {
 1078         int error;
 1079 
 1080         error = ktls_ocf_try(so, tls, direction);
 1081         if (error)
 1082                 return (error);
 1083         ktls_use_sw(tls);
 1084         return (0);
 1085 }
 1086 
 1087 /*
 1088  * KTLS RX stores data in the socket buffer as a list of TLS records,
  1089  * where each record is stored as a control message containing the TLS
 1090  * header followed by data mbufs containing the decrypted data.  This
 1091  * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
 1092  * both encrypted and decrypted data.  TLS records decrypted by a NIC
 1093  * should be queued to the socket buffer as records, but encrypted
 1094  * data which needs to be decrypted by software arrives as a stream of
 1095  * regular mbufs which need to be converted.  In addition, there may
 1096  * already be pending encrypted data in the socket buffer when KTLS RX
 1097  * is enabled.
 1098  *
 1099  * To manage not-yet-decrypted data for KTLS RX, the following scheme
 1100  * is used:
 1101  *
 1102  * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
 1103  *
 1104  * - ktls_check_rx checks this chain of mbufs reading the TLS header
 1105  *   from the first mbuf.  Once all of the data for that TLS record is
 1106  *   queued, the socket is queued to a worker thread.
 1107  *
 1108  * - The worker thread calls ktls_decrypt to decrypt TLS records in
 1109  *   the TLS chain.  Each TLS record is detached from the TLS chain,
 1110  *   decrypted, and inserted into the regular socket buffer chain as
  1111  *   a record starting with a control message holding the TLS header
  1112  *   and a chain of mbufs holding the decrypted data.
 1113  */
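/*
 * Userland consumer sketch (the option and structure names below come
 * from sys/ktls.h and are not defined in this file): once decrypted
 * records have been queued as described above, each recvmsg(2) call
 * returns one TLS record, with the TLS header delivered as a
 * TLS_GET_RECORD control message at level IPPROTO_TCP:
 *
 *	struct tls_get_record tgr;
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	if (cmsg != NULL && cmsg->cmsg_level == IPPROTO_TCP &&
 *	    cmsg->cmsg_type == TLS_GET_RECORD) {
 *		memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
 *		// tgr.tls_type holds the real record type
 *	}
 */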
 1114 
 1115 static void
 1116 sb_mark_notready(struct sockbuf *sb)
 1117 {
 1118         struct mbuf *m;
 1119 
 1120         m = sb->sb_mb;
 1121         sb->sb_mtls = m;
 1122         sb->sb_mb = NULL;
 1123         sb->sb_mbtail = NULL;
 1124         sb->sb_lastrecord = NULL;
 1125         for (; m != NULL; m = m->m_next) {
 1126                 KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
 1127                     __func__));
 1128                 KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
 1129                     __func__));
 1130                 KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
 1131                     __func__));
 1132                 m->m_flags |= M_NOTREADY;
 1133                 sb->sb_acc -= m->m_len;
 1134                 sb->sb_tlscc += m->m_len;
 1135                 sb->sb_mtlstail = m;
 1136         }
 1137         KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
 1138             ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
 1139             sb->sb_ccc));
 1140 }
 1141 
 1142 /*
 1143  * Return information about the pending TLS data in a socket
 1144  * buffer.  On return, 'seqno' is set to the sequence number
 1145  * of the next TLS record to be received, 'resid' is set to
  1146  * the number of bytes still needed for the last pending
 1147  * record.  The function returns 'false' if the last pending
 1148  * record contains a partial TLS header.  In that case, 'resid'
 1149  * is the number of bytes needed to complete the TLS header.
 1150  */
 1151 bool
 1152 ktls_pending_rx_info(struct sockbuf *sb, uint64_t *seqnop, size_t *residp)
 1153 {
 1154         struct tls_record_layer hdr;
 1155         struct mbuf *m;
 1156         uint64_t seqno;
 1157         size_t resid;
 1158         u_int offset, record_len;
 1159 
 1160         SOCKBUF_LOCK_ASSERT(sb);
 1161         MPASS(sb->sb_flags & SB_TLS_RX);
 1162         seqno = sb->sb_tls_seqno;
 1163         resid = sb->sb_tlscc;
 1164         m = sb->sb_mtls;
 1165         offset = 0;
 1166 
 1167         if (resid == 0) {
 1168                 *seqnop = seqno;
 1169                 *residp = 0;
 1170                 return (true);
 1171         }
 1172 
 1173         for (;;) {
 1174                 seqno++;
 1175 
 1176                 if (resid < sizeof(hdr)) {
 1177                         *seqnop = seqno;
 1178                         *residp = sizeof(hdr) - resid;
 1179                         return (false);
 1180                 }
 1181 
 1182                 m_copydata(m, offset, sizeof(hdr), (void *)&hdr);
 1183 
 1184                 record_len = sizeof(hdr) + ntohs(hdr.tls_length);
 1185                 if (resid <= record_len) {
 1186                         *seqnop = seqno;
 1187                         *residp = record_len - resid;
 1188                         return (true);
 1189                 }
 1190                 resid -= record_len;
 1191 
 1192                 while (record_len != 0) {
 1193                         if (m->m_len - offset > record_len) {
 1194                                 offset += record_len;
 1195                                 break;
 1196                         }
 1197 
 1198                         record_len -= (m->m_len - offset);
 1199                         offset = 0;
 1200                         m = m->m_next;
 1201                 }
 1202         }
 1203 }
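/*
 * A worked example: with one complete record carrying 100 payload
 * bytes queued (sb_tlscc = 5 + 100 = 105), the loop reads the record's
 * header, computes record_len = 105, and since resid (105) <=
 * record_len, returns true with *residp = 0 and *seqnop advanced by
 * one.  Had only 60 of those 105 bytes arrived, *residp would instead
 * be 45, the number of bytes still needed to complete the record.
 */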
 1204 
 1205 int
 1206 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 1207 {
 1208         struct ktls_session *tls;
 1209         int error;
 1210 
 1211         if (!ktls_offload_enable)
 1212                 return (ENOTSUP);
 1213         if (SOLISTENING(so))
 1214                 return (EINVAL);
 1215 
 1216         counter_u64_add(ktls_offload_enable_calls, 1);
 1217 
 1218         /*
 1219          * This should always be true since only the TCP socket option
 1220          * invokes this function.
 1221          */
 1222         if (so->so_proto->pr_protocol != IPPROTO_TCP)
 1223                 return (EINVAL);
 1224 
 1225         /*
 1226          * XXX: Don't overwrite existing sessions.  We should permit
 1227          * this to support rekeying in the future.
 1228          */
 1229         if (so->so_rcv.sb_tls_info != NULL)
 1230                 return (EALREADY);
 1231 
 1232         if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 1233                 return (ENOTSUP);
 1234 
 1235         error = ktls_create_session(so, en, &tls, KTLS_RX);
 1236         if (error)
 1237                 return (error);
 1238 
 1239         error = ktls_ocf_try(so, tls, KTLS_RX);
 1240         if (error) {
 1241                 ktls_free(tls);
 1242                 return (error);
 1243         }
 1244 
 1245         /* Mark the socket as using TLS offload. */
 1246         SOCKBUF_LOCK(&so->so_rcv);
 1247         so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
 1248         so->so_rcv.sb_tls_info = tls;
 1249         so->so_rcv.sb_flags |= SB_TLS_RX;
 1250 
 1251         /* Mark existing data as not ready until it can be decrypted. */
 1252         sb_mark_notready(&so->so_rcv);
 1253         ktls_check_rx(&so->so_rcv);
 1254         SOCKBUF_UNLOCK(&so->so_rcv);
 1255 
 1256         /* Prefer TOE -> ifnet TLS -> software TLS. */
 1257 #ifdef TCP_OFFLOAD
 1258         error = ktls_try_toe(so, tls, KTLS_RX);
 1259         if (error)
 1260 #endif
 1261                 error = ktls_try_ifnet(so, tls, KTLS_RX, false);
 1262         if (error)
 1263                 ktls_use_sw(tls);
 1264 
 1265         counter_u64_add(ktls_offload_total, 1);
 1266 
 1267         return (0);
 1268 }
 1269 
 1270 int
 1271 ktls_enable_tx(struct socket *so, struct tls_enable *en)
 1272 {
 1273         struct ktls_session *tls;
 1274         struct inpcb *inp;
 1275         int error;
 1276 
 1277         if (!ktls_offload_enable)
 1278                 return (ENOTSUP);
 1279         if (SOLISTENING(so))
 1280                 return (EINVAL);
 1281 
 1282         counter_u64_add(ktls_offload_enable_calls, 1);
 1283 
 1284         /*
 1285          * This should always be true since only the TCP socket option
 1286          * invokes this function.
 1287          */
 1288         if (so->so_proto->pr_protocol != IPPROTO_TCP)
 1289                 return (EINVAL);
 1290 
 1291         /*
 1292          * XXX: Don't overwrite existing sessions.  We should permit
 1293          * this to support rekeying in the future.
 1294          */
 1295         if (so->so_snd.sb_tls_info != NULL)
 1296                 return (EALREADY);
 1297 
 1298         if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 1299                 return (ENOTSUP);
 1300 
 1301         /* TLS requires ext pgs */
 1302         if (mb_use_ext_pgs == 0)
 1303                 return (ENXIO);
 1304 
 1305         error = ktls_create_session(so, en, &tls, KTLS_TX);
 1306         if (error)
 1307                 return (error);
 1308 
 1309         /* Prefer TOE -> ifnet TLS -> software TLS. */
 1310 #ifdef TCP_OFFLOAD
 1311         error = ktls_try_toe(so, tls, KTLS_TX);
 1312         if (error)
 1313 #endif
 1314                 error = ktls_try_ifnet(so, tls, KTLS_TX, false);
 1315         if (error)
 1316                 error = ktls_try_sw(so, tls, KTLS_TX);
 1317 
 1318         if (error) {
 1319                 ktls_free(tls);
 1320                 return (error);
 1321         }
 1322 
 1323         error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 1324         if (error) {
 1325                 ktls_free(tls);
 1326                 return (error);
 1327         }
 1328 
 1329         /*
 1330          * Write lock the INP when setting sb_tls_info so that
 1331          * routines in tcp_ratelimit.c can read sb_tls_info while
 1332          * holding the INP lock.
 1333          */
 1334         inp = so->so_pcb;
 1335         INP_WLOCK(inp);
 1336         SOCKBUF_LOCK(&so->so_snd);
 1337         so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 1338         so->so_snd.sb_tls_info = tls;
 1339         if (tls->mode != TCP_TLS_MODE_SW)
 1340                 so->so_snd.sb_flags |= SB_TLS_IFNET;
 1341         SOCKBUF_UNLOCK(&so->so_snd);
 1342         INP_WUNLOCK(inp);
 1343         SOCK_IO_SEND_UNLOCK(so);
 1344 
 1345         counter_u64_add(ktls_offload_total, 1);
 1346 
 1347         return (0);
 1348 }
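/*
 * Usage sketch (the socket option and structure come from sys/ktls.h
 * and netinet/tcp.h, not this file): after completing the handshake in
 * userland, an application hands the negotiated keys to the kernel via
 * the TCP_TXTLS_ENABLE socket option, e.g. for TLS 1.2 AES-128-GCM:
 *
 *	struct tls_enable en = {};
 *
 *	en.cipher_algorithm = CRYPTO_AES_NIST_GCM_16;
 *	en.cipher_key = key;		// 16-byte AES-128 key
 *	en.cipher_key_len = 16;
 *	en.iv = iv;			// 4-byte implicit IV
 *	en.iv_len = TLS_AEAD_GCM_LEN;
 *	en.tls_vmajor = TLS_MAJOR_VER_ONE;
 *	en.tls_vminor = TLS_MINOR_VER_TWO;
 *	if (setsockopt(s, IPPROTO_TCP, TCP_TXTLS_ENABLE,
 *	    &en, sizeof(en)) == -1)
 *		err(1, "TCP_TXTLS_ENABLE");
 *
 * TCP_RXTLS_ENABLE works analogously for the receive path, with
 * en.rec_seq carrying the next expected record sequence number.
 */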
 1349 
 1350 int
 1351 ktls_get_rx_mode(struct socket *so, int *modep)
 1352 {
 1353         struct ktls_session *tls;
 1354         struct inpcb *inp __diagused;
 1355 
 1356         if (SOLISTENING(so))
 1357                 return (EINVAL);
 1358         inp = so->so_pcb;
 1359         INP_WLOCK_ASSERT(inp);
 1360         SOCK_RECVBUF_LOCK(so);
 1361         tls = so->so_rcv.sb_tls_info;
 1362         if (tls == NULL)
 1363                 *modep = TCP_TLS_MODE_NONE;
 1364         else
 1365                 *modep = tls->mode;
 1366         SOCK_RECVBUF_UNLOCK(so);
 1367         return (0);
 1368 }
 1369 
 1370 /*
 1371  * ktls_get_rx_sequence - get the next TCP- and TLS- sequence number.
 1372  *
 1373  * This function gets information about the next TCP- and TLS-
 1374  * sequence number to be processed by the TLS receive worker
 1375  * thread. The information is extracted from the given "inpcb"
 1376  * structure. The values are stored in host endian format at the two
 1377  * given output pointer locations. The TCP sequence number points to
 1378  * the beginning of the TLS header.
 1379  *
 1380  * This function returns zero on success, else a non-zero error code
 1381  * is returned.
 1382  */
 1383 int
 1384 ktls_get_rx_sequence(struct inpcb *inp, uint32_t *tcpseq, uint64_t *tlsseq)
 1385 {
 1386         struct socket *so;
 1387         struct tcpcb *tp;
 1388 
 1389         INP_RLOCK(inp);
 1390         so = inp->inp_socket;
 1391         if (__predict_false(so == NULL)) {
 1392                 INP_RUNLOCK(inp);
 1393                 return (EINVAL);
 1394         }
 1395         if (inp->inp_flags & INP_DROPPED) {
 1396                 INP_RUNLOCK(inp);
 1397                 return (ECONNRESET);
 1398         }
 1399 
 1400         tp = intotcpcb(inp);
 1401         MPASS(tp != NULL);
 1402 
 1403         SOCKBUF_LOCK(&so->so_rcv);
 1404         *tcpseq = tp->rcv_nxt - so->so_rcv.sb_tlscc;
 1405         *tlsseq = so->so_rcv.sb_tls_seqno;
 1406         SOCKBUF_UNLOCK(&so->so_rcv);
 1407 
 1408         INP_RUNLOCK(inp);
 1409 
 1410         return (0);
 1411 }
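      /*
       * Hedged usage sketch (not part of this file): a NIC driver's RX
       * resync path might query the next expected sequence numbers like
       * this, assuming it already holds a valid inpcb pointer:
       *
       *	uint32_t tcp_sn;
       *	uint64_t tls_sn;
       *
       *	if (ktls_get_rx_sequence(inp, &tcp_sn, &tls_sn) == 0) {
       *		// tcp_sn is the TCP sequence of the next TLS
       *		// header; tls_sn is the next record number.
       *	}
       */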
 1412 
 1413 int
 1414 ktls_get_tx_mode(struct socket *so, int *modep)
 1415 {
 1416         struct ktls_session *tls;
 1417         struct inpcb *inp __diagused;
 1418 
 1419         if (SOLISTENING(so))
 1420                 return (EINVAL);
 1421         inp = so->so_pcb;
 1422         INP_WLOCK_ASSERT(inp);
 1423         SOCK_SENDBUF_LOCK(so);
 1424         tls = so->so_snd.sb_tls_info;
 1425         if (tls == NULL)
 1426                 *modep = TCP_TLS_MODE_NONE;
 1427         else
 1428                 *modep = tls->mode;
 1429         SOCK_SENDBUF_UNLOCK(so);
 1430         return (0);
 1431 }
 1432 
 1433 /*
 1434  * Switch between SW and ifnet TLS sessions as requested.
 1435  */
 1436 int
 1437 ktls_set_tx_mode(struct socket *so, int mode)
 1438 {
 1439         struct ktls_session *tls, *tls_new;
 1440         struct inpcb *inp;
 1441         int error;
 1442 
 1443         if (SOLISTENING(so))
 1444                 return (EINVAL);
 1445         switch (mode) {
 1446         case TCP_TLS_MODE_SW:
 1447         case TCP_TLS_MODE_IFNET:
 1448                 break;
 1449         default:
 1450                 return (EINVAL);
 1451         }
 1452 
 1453         inp = so->so_pcb;
 1454         INP_WLOCK_ASSERT(inp);
 1455         SOCKBUF_LOCK(&so->so_snd);
 1456         tls = so->so_snd.sb_tls_info;
 1457         if (tls == NULL) {
 1458                 SOCKBUF_UNLOCK(&so->so_snd);
 1459                 return (0);
 1460         }
 1461 
 1462         if (tls->mode == mode) {
 1463                 SOCKBUF_UNLOCK(&so->so_snd);
 1464                 return (0);
 1465         }
 1466 
 1467         tls = ktls_hold(tls);
 1468         SOCKBUF_UNLOCK(&so->so_snd);
 1469         INP_WUNLOCK(inp);
 1470 
 1471         tls_new = ktls_clone_session(tls, KTLS_TX);
 1472 
 1473         if (mode == TCP_TLS_MODE_IFNET)
 1474                 error = ktls_try_ifnet(so, tls_new, KTLS_TX, true);
 1475         else
 1476                 error = ktls_try_sw(so, tls_new, KTLS_TX);
 1477         if (error) {
 1478                 counter_u64_add(ktls_switch_failed, 1);
 1479                 ktls_free(tls_new);
 1480                 ktls_free(tls);
 1481                 INP_WLOCK(inp);
 1482                 return (error);
 1483         }
 1484 
 1485         error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 1486         if (error) {
 1487                 counter_u64_add(ktls_switch_failed, 1);
 1488                 ktls_free(tls_new);
 1489                 ktls_free(tls);
 1490                 INP_WLOCK(inp);
 1491                 return (error);
 1492         }
 1493 
 1494         /*
 1495          * If we raced with another session change, keep the existing
 1496          * session.
 1497          */
 1498         if (tls != so->so_snd.sb_tls_info) {
 1499                 counter_u64_add(ktls_switch_failed, 1);
 1500                 SOCK_IO_SEND_UNLOCK(so);
 1501                 ktls_free(tls_new);
 1502                 ktls_free(tls);
 1503                 INP_WLOCK(inp);
 1504                 return (EBUSY);
 1505         }
 1506 
 1507         INP_WLOCK(inp);
 1508         SOCKBUF_LOCK(&so->so_snd);
 1509         so->so_snd.sb_tls_info = tls_new;
 1510         if (tls_new->mode != TCP_TLS_MODE_SW)
 1511                 so->so_snd.sb_flags |= SB_TLS_IFNET;
 1512         SOCKBUF_UNLOCK(&so->so_snd);
 1513         SOCK_IO_SEND_UNLOCK(so);
 1514 
 1515         /*
 1516          * Drop two references on 'tls'.  The first is for the
 1517          * ktls_hold() above.  The second drops the reference from the
 1518          * socket buffer.
 1519          */
 1520         KASSERT(tls->refcount >= 2, ("too few references on old session"));
 1521         ktls_free(tls);
 1522         ktls_free(tls);
 1523 
 1524         if (mode == TCP_TLS_MODE_IFNET)
 1525                 counter_u64_add(ktls_switch_to_ifnet, 1);
 1526         else
 1527                 counter_u64_add(ktls_switch_to_sw, 1);
 1528 
 1529         return (0);
 1530 }
 1531 
 1532 /*
 1533  * Try to allocate a new TLS receive tag.  This task is scheduled when
 1534  * sbappend_ktls_rx detects an input path change.  If a new tag is
 1535  * allocated, replace the tag in the TLS session.  If a new tag cannot
 1536  * be allocated, let the session fall back to software decryption.
 1537  */
 1538 static void
 1539 ktls_reset_receive_tag(void *context, int pending)
 1540 {
 1541         union if_snd_tag_alloc_params params;
 1542         struct ktls_session *tls;
 1543         struct m_snd_tag *mst;
 1544         struct inpcb *inp;
 1545         struct ifnet *ifp;
 1546         struct socket *so;
 1547         int error;
 1548 
 1549         MPASS(pending == 1);
 1550 
 1551         tls = context;
 1552         so = tls->so;
 1553         inp = so->so_pcb;
 1554         ifp = NULL;
 1555 
 1556         INP_RLOCK(inp);
 1557         if (inp->inp_flags & INP_DROPPED) {
 1558                 INP_RUNLOCK(inp);
 1559                 goto out;
 1560         }
 1561 
 1562         SOCKBUF_LOCK(&so->so_rcv);
 1563         mst = tls->snd_tag;
 1564         tls->snd_tag = NULL;
 1565         if (mst != NULL)
 1566                 m_snd_tag_rele(mst);
 1567 
 1568         ifp = tls->rx_ifp;
 1569         if_ref(ifp);
 1570         SOCKBUF_UNLOCK(&so->so_rcv);
 1571 
 1572         params.hdr.type = IF_SND_TAG_TYPE_TLS_RX;
 1573         params.hdr.flowid = inp->inp_flowid;
 1574         params.hdr.flowtype = inp->inp_flowtype;
 1575         params.hdr.numa_domain = inp->inp_numa_domain;
 1576         params.tls_rx.inp = inp;
 1577         params.tls_rx.tls = tls;
 1578         params.tls_rx.vlan_id = tls->rx_vlan_id;
 1579         INP_RUNLOCK(inp);
 1580 
 1581         if (inp->inp_vflag & INP_IPV6) {
 1582                 if ((ifp->if_capenable2 & IFCAP2_RXTLS6) == 0)
 1583                         goto out;
 1584         } else {
 1585                 if ((ifp->if_capenable2 & IFCAP2_RXTLS4) == 0)
 1586                         goto out;
 1587         }
 1588 
 1589         error = m_snd_tag_alloc(ifp, &params, &mst);
 1590         if (error == 0) {
 1591                 SOCKBUF_LOCK(&so->so_rcv);
 1592                 tls->snd_tag = mst;
 1593                 SOCKBUF_UNLOCK(&so->so_rcv);
 1594 
 1595                 counter_u64_add(ktls_ifnet_reset, 1);
 1596         } else {
 1597                 /*
 1598                  * Just fall back to software decryption if a tag
 1599                  * cannot be allocated, leaving the connection intact.
 1600                  * If a future input path change switches to another
 1601                  * interface, this connection will resume ifnet TLS.
 1602                  */
 1603                 counter_u64_add(ktls_ifnet_reset_failed, 1);
 1604         }
 1605 
 1606 out:
 1607         mtx_pool_lock(mtxpool_sleep, tls);
 1608         tls->reset_pending = false;
 1609         mtx_pool_unlock(mtxpool_sleep, tls);
 1610 
 1611         if (ifp != NULL)
 1612                 if_rele(ifp);
 1613         sorele(so);
 1614         ktls_free(tls);
 1615 }
 1616 
 1617 /*
 1618  * Try to allocate a new TLS send tag.  This task is scheduled when
 1619  * ip_output detects a route change while trying to transmit a packet
 1620  * holding a TLS record.  If a new tag is allocated, replace the tag
 1621  * in the TLS session.  Subsequent packets on the connection will use
 1622  * the new tag.  If a new tag cannot be allocated, drop the
 1623  * connection.
 1624  */
 1625 static void
 1626 ktls_reset_send_tag(void *context, int pending)
 1627 {
 1628         struct epoch_tracker et;
 1629         struct ktls_session *tls;
 1630         struct m_snd_tag *old, *new;
 1631         struct inpcb *inp;
 1632         struct tcpcb *tp;
 1633         int error;
 1634 
 1635         MPASS(pending == 1);
 1636 
 1637         tls = context;
 1638         inp = tls->inp;
 1639 
 1640         /*
 1641          * Free the old tag before allocating a new one.
 1642          * ip[6]_output_send() will treat a NULL send tag the same as
 1643          * an ifp mismatch and drop packets until a new tag is
 1644          * allocated.
 1645          *
 1646          * Write-lock the INP when changing tls->snd_tag since
 1647          * ip[6]_output_send() holds a read-lock when reading the
 1648          * pointer.
 1649          */
 1650         INP_WLOCK(inp);
 1651         old = tls->snd_tag;
 1652         tls->snd_tag = NULL;
 1653         INP_WUNLOCK(inp);
 1654         if (old != NULL)
 1655                 m_snd_tag_rele(old);
 1656 
 1657         error = ktls_alloc_snd_tag(inp, tls, true, &new);
 1658 
 1659         if (error == 0) {
 1660                 INP_WLOCK(inp);
 1661                 tls->snd_tag = new;
 1662                 mtx_pool_lock(mtxpool_sleep, tls);
 1663                 tls->reset_pending = false;
 1664                 mtx_pool_unlock(mtxpool_sleep, tls);
 1665                 if (!in_pcbrele_wlocked(inp))
 1666                         INP_WUNLOCK(inp);
 1667 
 1668                 counter_u64_add(ktls_ifnet_reset, 1);
 1669 
 1670                 /*
 1671                  * XXX: Should we kick tcp_output explicitly now that
 1672                  * the send tag is fixed or just rely on timers?
 1673                  */
 1674         } else {
 1675                 NET_EPOCH_ENTER(et);
 1676                 INP_WLOCK(inp);
 1677                 if (!in_pcbrele_wlocked(inp)) {
 1678                         if (!(inp->inp_flags & INP_DROPPED)) {
 1679                                 tp = intotcpcb(inp);
 1680                                 CURVNET_SET(inp->inp_vnet);
 1681                                 tp = tcp_drop(tp, ECONNABORTED);
 1682                                 CURVNET_RESTORE();
 1683                                 if (tp != NULL)
 1684                                         INP_WUNLOCK(inp);
 1685                                 counter_u64_add(ktls_ifnet_reset_dropped, 1);
 1686                         } else
 1687                                 INP_WUNLOCK(inp);
 1688                 }
 1689                 NET_EPOCH_EXIT(et);
 1690 
 1691                 counter_u64_add(ktls_ifnet_reset_failed, 1);
 1692 
 1693                 /*
 1694                  * Leave reset_pending true to avoid future tasks while
 1695                  * the socket goes away.
 1696                  */
 1697         }
 1698 
 1699         ktls_free(tls);
 1700 }
 1701 
 1702 void
 1703 ktls_input_ifp_mismatch(struct sockbuf *sb, struct ifnet *ifp)
 1704 {
 1705         struct ktls_session *tls;
 1706         struct socket *so;
 1707 
 1708         SOCKBUF_LOCK_ASSERT(sb);
 1709         KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 1710             __func__, sb));
 1711         so = __containerof(sb, struct socket, so_rcv);
 1712 
 1713         tls = sb->sb_tls_info;
 1714         if_rele(tls->rx_ifp);
 1715         if_ref(ifp);
 1716         tls->rx_ifp = ifp;
 1717 
 1718         /*
 1719          * See if we should schedule a task to update the receive tag for
 1720          * this session.
 1721          */
 1722         mtx_pool_lock(mtxpool_sleep, tls);
 1723         if (!tls->reset_pending) {
 1724                 (void) ktls_hold(tls);
 1725                 soref(so);
 1726                 tls->so = so;
 1727                 tls->reset_pending = true;
 1728                 taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 1729         }
 1730         mtx_pool_unlock(mtxpool_sleep, tls);
 1731 }
 1732 
 1733 int
 1734 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 1735 {
 1736 
 1737         if (inp == NULL)
 1738                 return (ENOBUFS);
 1739 
 1740         INP_LOCK_ASSERT(inp);
 1741 
 1742         /*
 1743          * See if we should schedule a task to update the send tag for
 1744          * this session.
 1745          */
 1746         mtx_pool_lock(mtxpool_sleep, tls);
 1747         if (!tls->reset_pending) {
 1748                 (void) ktls_hold(tls);
 1749                 in_pcbref(inp);
 1750                 tls->inp = inp;
 1751                 tls->reset_pending = true;
 1752                 taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 1753         }
 1754         mtx_pool_unlock(mtxpool_sleep, tls);
 1755         return (ENOBUFS);
 1756 }
 1757 
 1758 #ifdef RATELIMIT
 1759 int
 1760 ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
 1761 {
 1762         union if_snd_tag_modify_params params = {
 1763                 .rate_limit.max_rate = max_pacing_rate,
 1764                 .rate_limit.flags = M_NOWAIT,
 1765         };
 1766         struct m_snd_tag *mst;
 1767 
 1768         /* Can't get to the inp, but it should be locked. */
 1769         /* INP_LOCK_ASSERT(inp); */
 1770 
 1771         MPASS(tls->mode == TCP_TLS_MODE_IFNET);
 1772 
 1773         if (tls->snd_tag == NULL) {
 1774                 /*
 1775                  * Resetting send tag, ignore this change.  The
 1776                  * pending reset may or may not see this updated rate
 1777                  * in the tcpcb.  If it doesn't, we will just lose
 1778                  * this rate change.
 1779                  */
 1780                 return (0);
 1781         }
 1782 
 1783         mst = tls->snd_tag;
 1784 
 1785         MPASS(mst != NULL);
 1786         MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);
 1787 
 1788         return (mst->sw->snd_tag_modify(mst, &params));
 1789 }
 1790 #endif
 1791 #endif
 1792 
 1793 void
 1794 ktls_destroy(struct ktls_session *tls)
 1795 {
 1796         MPASS(tls->refcount == 0);
 1797 
 1798         if (tls->sequential_records) {
 1799                 struct mbuf *m, *n;
 1800                 int page_count;
 1801 
 1802                 STAILQ_FOREACH_SAFE(m, &tls->pending_records, m_epg_stailq, n) {
 1803                         page_count = m->m_epg_enc_cnt;
 1804                         while (page_count > 0) {
 1805                                 KASSERT(page_count >= m->m_epg_nrdy,
 1806                                     ("%s: too few pages", __func__));
 1807                                 page_count -= m->m_epg_nrdy;
 1808                                 m = m_free(m);
 1809                         }
 1810                 }
 1811         }
 1812 
 1813         counter_u64_add(ktls_offload_active, -1);
 1814         switch (tls->mode) {
 1815         case TCP_TLS_MODE_SW:
 1816                 switch (tls->params.cipher_algorithm) {
 1817                 case CRYPTO_AES_CBC:
 1818                         counter_u64_add(ktls_sw_cbc, -1);
 1819                         break;
 1820                 case CRYPTO_AES_NIST_GCM_16:
 1821                         counter_u64_add(ktls_sw_gcm, -1);
 1822                         break;
 1823                 case CRYPTO_CHACHA20_POLY1305:
 1824                         counter_u64_add(ktls_sw_chacha20, -1);
 1825                         break;
 1826                 }
 1827                 break;
 1828         case TCP_TLS_MODE_IFNET:
 1829                 switch (tls->params.cipher_algorithm) {
 1830                 case CRYPTO_AES_CBC:
 1831                         counter_u64_add(ktls_ifnet_cbc, -1);
 1832                         break;
 1833                 case CRYPTO_AES_NIST_GCM_16:
 1834                         counter_u64_add(ktls_ifnet_gcm, -1);
 1835                         break;
 1836                 case CRYPTO_CHACHA20_POLY1305:
 1837                         counter_u64_add(ktls_ifnet_chacha20, -1);
 1838                         break;
 1839                 }
 1840                 if (tls->snd_tag != NULL)
 1841                         m_snd_tag_rele(tls->snd_tag);
 1842                 if (tls->rx_ifp != NULL)
 1843                         if_rele(tls->rx_ifp);
 1844                 break;
 1845 #ifdef TCP_OFFLOAD
 1846         case TCP_TLS_MODE_TOE:
 1847                 switch (tls->params.cipher_algorithm) {
 1848                 case CRYPTO_AES_CBC:
 1849                         counter_u64_add(ktls_toe_cbc, -1);
 1850                         break;
 1851                 case CRYPTO_AES_NIST_GCM_16:
 1852                         counter_u64_add(ktls_toe_gcm, -1);
 1853                         break;
 1854                 case CRYPTO_CHACHA20_POLY1305:
 1855                         counter_u64_add(ktls_toe_chacha20, -1);
 1856                         break;
 1857                 }
 1858                 break;
 1859 #endif
 1860         }
 1861         if (tls->ocf_session != NULL)
 1862                 ktls_ocf_free(tls);
 1863         if (tls->params.auth_key != NULL) {
 1864                 zfree(tls->params.auth_key, M_KTLS);
 1865                 tls->params.auth_key = NULL;
 1866                 tls->params.auth_key_len = 0;
 1867         }
 1868         if (tls->params.cipher_key != NULL) {
 1869                 zfree(tls->params.cipher_key, M_KTLS);
 1870                 tls->params.cipher_key = NULL;
 1871                 tls->params.cipher_key_len = 0;
 1872         }
 1873         explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 1874 
 1875         uma_zfree(ktls_session_zone, tls);
 1876 }
 1877 
 1878 void
 1879 ktls_seq(struct sockbuf *sb, struct mbuf *m)
 1880 {
 1881 
 1882         for (; m != NULL; m = m->m_next) {
 1883                 KASSERT((m->m_flags & M_EXTPG) != 0,
 1884                     ("ktls_seq: mapped mbuf %p", m));
 1885 
 1886                 m->m_epg_seqno = sb->sb_tls_seqno;
 1887                 sb->sb_tls_seqno++;
 1888         }
 1889 }
 1890 
 1891 /*
 1892  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
 1893  * mbuf in the chain must be an unmapped mbuf, and its payload must
 1894  * contain the payload of a single TLS record.
 1895  *
 1896  * The record_type argument specifies the TLS record type used when
 1897  * populating the TLS header.
 1898  *
 1899  * On return, the enq_cnt argument is set to the number of pages of
 1900  * payload data for this entire chain that need to be encrypted via SW
 1901  * encryption.  The returned value should be passed to ktls_enqueue
 1902  * when scheduling encryption of this chain of mbufs.  To handle the
 1903  * special case of empty fragments for TLS 1.0 sessions, an empty
 1904  * fragment counts as one page.
 1905  */
 1906 void
 1907 ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
 1908     uint8_t record_type)
 1909 {
 1910         struct tls_record_layer *tlshdr;
 1911         struct mbuf *m;
 1912         uint64_t *noncep;
 1913         uint16_t tls_len;
 1914         int maxlen __diagused;
 1915 
 1916         maxlen = tls->params.max_frame_len;
 1917         *enq_cnt = 0;
 1918         for (m = top; m != NULL; m = m->m_next) {
 1919                 /*
 1920                  * All mbufs in the chain should be TLS records whose
 1921                  * payload does not exceed the maximum frame length.
 1922                  *
 1923                  * Empty TLS 1.0 records are permitted when using CBC.
 1924                  */
 1925                 KASSERT(m->m_len <= maxlen && m->m_len >= 0 &&
 1926                     (m->m_len > 0 || ktls_permit_empty_frames(tls)),
 1927                     ("ktls_frame: m %p len %d", m, m->m_len));
 1928 
 1929                 /*
 1930                  * TLS frames require unmapped mbufs to store session
 1931                  * info.
 1932                  */
 1933                 KASSERT((m->m_flags & M_EXTPG) != 0,
 1934                     ("ktls_frame: mapped mbuf %p (top = %p)", m, top));
 1935 
 1936                 tls_len = m->m_len;
 1937 
 1938                 /* Save a reference to the session. */
 1939                 m->m_epg_tls = ktls_hold(tls);
 1940 
 1941                 m->m_epg_hdrlen = tls->params.tls_hlen;
 1942                 m->m_epg_trllen = tls->params.tls_tlen;
 1943                 if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 1944                         int bs, delta;
 1945 
 1946                         /*
 1947                          * AES-CBC pads messages to a multiple of the
 1948                          * block size.  Note that the padding is
 1949                          * applied after the digest and the encryption
 1950                          * is done on the "plaintext || mac || padding".
 1951                          * At least one byte of padding is always
 1952                          * present.
 1953                          *
 1954                          * Compute the final trailer length assuming
 1955                          * at most one block of padding.
 1956                          * tls->params.tls_tlen is the maximum
 1957                          * possible trailer length (padding + digest).
 1958                          * delta holds the number of excess padding
 1959                          * bytes if the maximum were used.  Those
 1960                          * extra bytes are removed.
 1961                          */
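                              /*
                               * Worked example (illustrative, assuming a
                               * 20-byte SHA-1 MAC, so tls_tlen = 36, and
                               * bs = 16): for tls_len = 100,
                               *
                               *	delta  = (100 + 36) & 15 = 8
                               *	trllen = 36 - 8 = 28 (20 MAC + 8 pad)
                               *
                               * and 100 + 28 = 128 is block-aligned.
                               */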
 1962                         bs = tls->params.tls_bs;
 1963                         delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
 1964                         m->m_epg_trllen -= delta;
 1965                 }
 1966                 m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
 1967 
 1968                 /* Populate the TLS header. */
 1969                 tlshdr = (void *)m->m_epg_hdr;
 1970                 tlshdr->tls_vmajor = tls->params.tls_vmajor;
 1971 
 1972                 /*
 1973                  * TLS 1.3 masquerades as TLS 1.2 with a record type
 1974                  * of TLS_RLTYPE_APP.
 1975                  */
 1976                 if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
 1977                     tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
 1978                         tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
 1979                         tlshdr->tls_type = TLS_RLTYPE_APP;
 1980                         /* save the real record type for later */
 1981                         m->m_epg_record_type = record_type;
 1982                         m->m_epg_trail[0] = record_type;
 1983                 } else {
 1984                         tlshdr->tls_vminor = tls->params.tls_vminor;
 1985                         tlshdr->tls_type = record_type;
 1986                 }
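                      /*
                       * Illustrative example (not from the source): a TLS
                       * 1.3 handshake record (record_type = 0x16) leaves
                       * here with a TLS 1.2-looking wire header (version
                       * bytes 0x03 0x03, tls_type = 0x17), while the real
                       * type 0x16 rides in the first trailer byte and is
                       * encrypted as the inner content type.
                       */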
 1987                 tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 1988 
 1989                 /*
 1990                  * Store nonces / explicit IVs after the end of the
 1991                  * TLS header.
 1992                  *
 1993                  * For GCM with TLS 1.2, an 8 byte nonce is copied
 1994                  * from the end of the IV.  The nonce is then
 1995                  * incremented for use by the next record.
 1996                  *
 1997                  * For CBC, a random nonce is inserted for TLS 1.1+.
 1998                  */
 1999                 if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 2000                     tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
 2001                         noncep = (uint64_t *)(tls->params.iv + 8);
 2002                         be64enc(tlshdr + 1, *noncep);
 2003                         (*noncep)++;
 2004                 } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 2005                     tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
 2006                         arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
 2007 
 2008                 /*
 2009                  * When using SW encryption, mark the mbuf not ready.
 2010                  * It will be marked ready via sbready() after the
 2011                  * record has been encrypted.
 2012                  *
 2013                  * When using ifnet TLS, unencrypted TLS records are
 2014                  * sent down the stack to the NIC.
 2015                  */
 2016                 if (tls->mode == TCP_TLS_MODE_SW) {
 2017                         m->m_flags |= M_NOTREADY;
 2018                         if (__predict_false(tls_len == 0)) {
 2019                                 /* TLS 1.0 empty fragment. */
 2020                                 m->m_epg_nrdy = 1;
 2021                         } else
 2022                                 m->m_epg_nrdy = m->m_epg_npgs;
 2023                         *enq_cnt += m->m_epg_nrdy;
 2024                 }
 2025         }
 2026 }
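      /*
       * Hedged usage sketch (not part of this file): a TX path that has
       * built a chain of M_EXTPG record mbufs might frame them and, for
       * a software session, pass them to the encryption workers roughly
       * like this (the soref() reference is dropped when encryption of
       * the chain completes):
       *
       *	int enq_cnt;
       *
       *	ktls_frame(top, tls, &enq_cnt, TLS_RLTYPE_APP);
       *	if (tls->mode == TCP_TLS_MODE_SW) {
       *		soref(so);
       *		ktls_enqueue(top, so, enq_cnt);
       *	}
       */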
 2027 
 2028 bool
 2029 ktls_permit_empty_frames(struct ktls_session *tls)
 2030 {
 2031         return (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 2032             tls->params.tls_vminor == TLS_MINOR_VER_ZERO);
 2033 }
 2034 
 2035 void
 2036 ktls_check_rx(struct sockbuf *sb)
 2037 {
 2038         struct tls_record_layer hdr;
 2039         struct ktls_wq *wq;
 2040         struct socket *so;
 2041         bool running;
 2042 
 2043         SOCKBUF_LOCK_ASSERT(sb);
 2044         KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 2045             __func__, sb));
 2046         so = __containerof(sb, struct socket, so_rcv);
 2047 
 2048         if (sb->sb_flags & SB_TLS_RX_RUNNING)
 2049                 return;
 2050 
 2051         /* Is there enough queued for a TLS header? */
 2052         if (sb->sb_tlscc < sizeof(hdr)) {
 2053                 if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0)
 2054                         so->so_error = EMSGSIZE;
 2055                 return;
 2056         }
 2057 
 2058         m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
 2059 
 2060         /* Is the entire record queued? */
 2061         if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
 2062                 if ((sb->sb_state & SBS_CANTRCVMORE) != 0)
 2063                         so->so_error = EMSGSIZE;
 2064                 return;
 2065         }
 2066 
 2067         sb->sb_flags |= SB_TLS_RX_RUNNING;
 2068 
 2069         soref(so);
 2070         wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
 2071         mtx_lock(&wq->mtx);
 2072         STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
 2073         running = wq->running;
 2074         mtx_unlock(&wq->mtx);
 2075         if (!running)
 2076                 wakeup(wq);
 2077         counter_u64_add(ktls_cnt_rx_queued, 1);
 2078 }
 2079 
 2080 static struct mbuf *
 2081 ktls_detach_record(struct sockbuf *sb, int len)
 2082 {
 2083         struct mbuf *m, *n, *top;
 2084         int remain;
 2085 
 2086         SOCKBUF_LOCK_ASSERT(sb);
 2087         MPASS(len <= sb->sb_tlscc);
 2088 
 2089         /*
 2090          * If the TLS chain is the exact size of the record,
 2091          * just grab the whole record.
 2092          */
 2093         top = sb->sb_mtls;
 2094         if (sb->sb_tlscc == len) {
 2095                 sb->sb_mtls = NULL;
 2096                 sb->sb_mtlstail = NULL;
 2097                 goto out;
 2098         }
 2099 
 2100         /*
 2101          * While it would be nice to use m_split() here, we need
 2102          * to know exactly what m_split() allocates to update the
 2103          * accounting, so do it inline instead.
 2104          */
 2105         remain = len;
 2106         for (m = top; remain > m->m_len; m = m->m_next)
 2107                 remain -= m->m_len;
 2108 
 2109         /* Easy case: don't have to split 'm'. */
 2110         if (remain == m->m_len) {
 2111                 sb->sb_mtls = m->m_next;
 2112                 if (sb->sb_mtls == NULL)
 2113                         sb->sb_mtlstail = NULL;
 2114                 m->m_next = NULL;
 2115                 goto out;
 2116         }
 2117 
 2118         /*
 2119          * Need to allocate an mbuf to hold the remainder of 'm'.  Try
 2120          * with M_NOWAIT first.
 2121          */
 2122         n = m_get(M_NOWAIT, MT_DATA);
 2123         if (n == NULL) {
 2124                 /*
 2125                  * Use M_WAITOK with socket buffer unlocked.  If
 2126                  * 'sb_mtls' changes while the lock is dropped, return
 2127                  * NULL to force the caller to retry.
 2128                  */
 2129                 SOCKBUF_UNLOCK(sb);
 2130 
 2131                 n = m_get(M_WAITOK, MT_DATA);
 2132 
 2133                 SOCKBUF_LOCK(sb);
 2134                 if (sb->sb_mtls != top) {
 2135                         m_free(n);
 2136                         return (NULL);
 2137                 }
 2138         }
 2139         n->m_flags |= (m->m_flags & (M_NOTREADY | M_DECRYPTED));
 2140 
 2141         /* Store remainder in 'n'. */
 2142         n->m_len = m->m_len - remain;
 2143         if (m->m_flags & M_EXT) {
 2144                 n->m_data = m->m_data + remain;
 2145                 mb_dupcl(n, m);
 2146         } else {
 2147                 bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
 2148         }
 2149 
 2150         /* Trim 'm' and update accounting. */
 2151         m->m_len -= n->m_len;
 2152         sb->sb_tlscc -= n->m_len;
 2153         sb->sb_ccc -= n->m_len;
 2154 
 2155         /* Account for 'n'. */
 2156         sballoc_ktls_rx(sb, n);
 2157 
 2158         /* Insert 'n' into the TLS chain. */
 2159         sb->sb_mtls = n;
 2160         n->m_next = m->m_next;
 2161         if (sb->sb_mtlstail == m)
 2162                 sb->sb_mtlstail = n;
 2163 
 2164         /* Detach the record from the TLS chain. */
 2165         m->m_next = NULL;
 2166 
 2167 out:
 2168         MPASS(m_length(top, NULL) == len);
 2169         for (m = top; m != NULL; m = m->m_next)
 2170                 sbfree_ktls_rx(sb, m);
 2171         sb->sb_tlsdcc = len;
 2172         sb->sb_ccc += len;
 2173         SBCHECK(sb);
 2174         return (top);
 2175 }
 2176 
 2177 /*
 2178  * Determine the length of the trailing zero padding and find the real
 2179  * record type in the byte before the padding.
 2180  *
 2181  * Walking the mbuf chain backwards is clumsy, so another option would
 2182  * be to scan forwards remembering the last non-zero byte before the
 2183  * trailer.  However, it would be expensive to scan the entire record.
 2184  * Instead, find the last non-zero byte of each mbuf in the chain,
 2185  * keeping track of the relative offset of that non-zero byte.
 2186  *
 2187  * On input, *trailer_len is the size of the MAC/tag; on return, it is
 2188  * set to the size of the full trailer, including the padding and the
 2189  * record type byte.
 2190  */
 2191 static int
 2192 tls13_find_record_type(struct ktls_session *tls, struct mbuf *m, int tls_len,
 2193     int *trailer_len, uint8_t *record_typep)
 2194 {
 2195         char *cp;
 2196         u_int digest_start, last_offset, m_len, offset;
 2197         uint8_t record_type;
 2198 
 2199         digest_start = tls_len - *trailer_len;
 2200         last_offset = 0;
 2201         offset = 0;
 2202         for (; m != NULL && offset < digest_start;
 2203              offset += m->m_len, m = m->m_next) {
 2204                 /* Don't look for padding in the tag. */
 2205                 m_len = min(digest_start - offset, m->m_len);
 2206                 cp = mtod(m, char *);
 2207 
 2208                 /* Find last non-zero byte in this mbuf. */
 2209                 while (m_len > 0 && cp[m_len - 1] == 0)
 2210                         m_len--;
 2211                 if (m_len > 0) {
 2212                         record_type = cp[m_len - 1];
 2213                         last_offset = offset + m_len;
 2214                 }
 2215         }
 2216         if (last_offset < tls->params.tls_hlen)
 2217                 return (EBADMSG);
 2218 
 2219         *record_typep = record_type;
 2220         *trailer_len = tls_len - last_offset + 1;
 2221         return (0);
 2222 }
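      /*
       * Worked example (illustrative): for a record whose decrypted
       * contents are 11 payload bytes, the inner type byte 0x16
       * (handshake), and 3 bytes of zero padding, followed by a 16-byte
       * tag, with tls_hlen = 5:
       *
       *	tls_len      = 5 + 11 + 1 + 3 + 16 = 36
       *	digest_start = 36 - 16 = 20
       *
       * The scan finds the last non-zero byte at absolute offset 16, so
       * last_offset = 17, *record_typep = 0x16, and
       * *trailer_len = 36 - 17 + 1 = 20 (type byte + padding + tag).
       */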
 2223 
 2224 /*
 2225  * Check whether an mbuf chain is fully decrypted at the given offset
 2226  * and length.  Returns KTLS_MBUF_CRYPTO_ST_DECRYPTED if all data is
 2227  * decrypted, KTLS_MBUF_CRYPTO_ST_MIXED if there is a mix of encrypted
 2228  * and decrypted data, and KTLS_MBUF_CRYPTO_ST_ENCRYPTED if all data
 2229  * is encrypted.
 2230  */
 2231 ktls_mbuf_crypto_st_t
 2232 ktls_mbuf_crypto_state(struct mbuf *mb, int offset, int len)
 2233 {
 2234         int m_flags_ored = 0;
 2235         int m_flags_anded = -1;
 2236 
 2237         for (; mb != NULL; mb = mb->m_next) {
 2238                 if (offset < mb->m_len)
 2239                         break;
 2240                 offset -= mb->m_len;
 2241         }
 2242         offset += len;
 2243 
 2244         for (; mb != NULL; mb = mb->m_next) {
 2245                 m_flags_ored |= mb->m_flags;
 2246                 m_flags_anded &= mb->m_flags;
 2247 
 2248                 if (offset <= mb->m_len)
 2249                         break;
 2250                 offset -= mb->m_len;
 2251         }
 2252         MPASS(mb != NULL || offset == 0);
 2253 
 2254         if ((m_flags_ored ^ m_flags_anded) & M_DECRYPTED)
 2255                 return (KTLS_MBUF_CRYPTO_ST_MIXED);
 2256         else
 2257                 return ((m_flags_ored & M_DECRYPTED) ?
 2258                     KTLS_MBUF_CRYPTO_ST_DECRYPTED :
 2259                     KTLS_MBUF_CRYPTO_ST_ENCRYPTED);
 2260 }
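      /*
       * Illustrative example (not from the source): for a range spanning
       * three mbufs with flags {M_DECRYPTED, 0, M_DECRYPTED}, the OR
       * accumulates M_DECRYPTED while the AND clears it, so
       * (m_flags_ored ^ m_flags_anded) & M_DECRYPTED is non-zero and the
       * result is KTLS_MBUF_CRYPTO_ST_MIXED.  Only when every mbuf in
       * the range agrees does the XOR clear, and the OR then separates
       * all-decrypted from all-encrypted.
       */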
 2261 
 2262 /*
 2263  * ktls_resync_ifnet - get HW TLS RX back on track after packet loss
 2264  */
 2265 static int
 2266 ktls_resync_ifnet(struct socket *so, uint32_t tls_len, uint64_t tls_rcd_num)
 2267 {
 2268         union if_snd_tag_modify_params params;
 2269         struct m_snd_tag *mst;
 2270         struct inpcb *inp;
 2271         struct tcpcb *tp;
 2272 
 2273         mst = so->so_rcv.sb_tls_info->snd_tag;
 2274         if (__predict_false(mst == NULL))
 2275                 return (EINVAL);
 2276 
 2277         inp = sotoinpcb(so);
 2278         if (__predict_false(inp == NULL))
 2279                 return (EINVAL);
 2280 
 2281         INP_RLOCK(inp);
 2282         if (inp->inp_flags & INP_DROPPED) {
 2283                 INP_RUNLOCK(inp);
 2284                 return (ECONNRESET);
 2285         }
 2286 
 2287         tp = intotcpcb(inp);
 2288         MPASS(tp != NULL);
 2289 
 2290         /* Get the TCP sequence number of the next valid TLS header. */
 2291         SOCKBUF_LOCK(&so->so_rcv);
 2292         params.tls_rx.tls_hdr_tcp_sn =
 2293             tp->rcv_nxt - so->so_rcv.sb_tlscc - tls_len;
 2294         params.tls_rx.tls_rec_length = tls_len;
 2295         params.tls_rx.tls_seq_number = tls_rcd_num;
 2296         SOCKBUF_UNLOCK(&so->so_rcv);
 2297 
 2298         INP_RUNLOCK(inp);
 2299 
 2300         MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RX);
 2301         return (mst->sw->snd_tag_modify(mst, &params));
 2302 }
 2303 
 2304 static void
 2305 ktls_drop(struct socket *so, int error)
 2306 {
 2307         struct epoch_tracker et;
 2308         struct inpcb *inp = sotoinpcb(so);
 2309         struct tcpcb *tp;
 2310 
 2311         NET_EPOCH_ENTER(et);
 2312         INP_WLOCK(inp);
 2313         if (!(inp->inp_flags & INP_DROPPED)) {
 2314                 tp = intotcpcb(inp);
 2315                 CURVNET_SET(inp->inp_vnet);
 2316                 tp = tcp_drop(tp, error);
 2317                 CURVNET_RESTORE();
 2318                 if (tp != NULL)
 2319                         INP_WUNLOCK(inp);
 2320         } else {
 2321                 so->so_error = error;
 2322                 SOCK_RECVBUF_LOCK(so);
 2323                 sorwakeup_locked(so);
 2324                 INP_WUNLOCK(inp);
 2325         }
 2326         NET_EPOCH_EXIT(et);
 2327 }
 2328 
 2329 static void
 2330 ktls_decrypt(struct socket *so)
 2331 {
 2332         char tls_header[MBUF_PEXT_HDR_LEN];
 2333         struct ktls_session *tls;
 2334         struct sockbuf *sb;
 2335         struct tls_record_layer *hdr;
 2336         struct tls_get_record tgr;
 2337         struct mbuf *control, *data, *m;
 2338         ktls_mbuf_crypto_st_t state;
 2339         uint64_t seqno;
 2340         int error, remain, tls_len, trail_len;
 2341         bool tls13;
 2342         uint8_t vminor, record_type;
 2343 
 2344         hdr = (struct tls_record_layer *)tls_header;
 2345         sb = &so->so_rcv;
 2346         SOCKBUF_LOCK(sb);
 2347         KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
 2348             ("%s: socket %p not running", __func__, so));
 2349 
 2350         tls = sb->sb_tls_info;
 2351         MPASS(tls != NULL);
 2352 
 2353         tls13 = (tls->params.tls_vminor == TLS_MINOR_VER_THREE);
 2354         if (tls13)
 2355                 vminor = TLS_MINOR_VER_TWO;
 2356         else
 2357                 vminor = tls->params.tls_vminor;
 2358         for (;;) {
 2359                 /* Is there enough queued for a TLS header? */
 2360                 if (sb->sb_tlscc < tls->params.tls_hlen)
 2361                         break;
 2362 
 2363                 m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
 2364                 tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
 2365 
 2366                 if (hdr->tls_vmajor != tls->params.tls_vmajor ||
 2367                     hdr->tls_vminor != vminor)
 2368                         error = EINVAL;
 2369                 else if (tls13 && hdr->tls_type != TLS_RLTYPE_APP)
 2370                         error = EINVAL;
 2371                 else if (tls_len < tls->params.tls_hlen || tls_len >
 2372                     tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
 2373                     tls->params.tls_tlen)
 2374                         error = EMSGSIZE;
 2375                 else
 2376                         error = 0;
 2377                 if (__predict_false(error != 0)) {
 2378                         /*
 2379                          * We have a corrupted record and are likely
 2380                          * out of sync.  The connection isn't
 2381                          * recoverable at this point, so abort it.
 2382                          */
 2383                         SOCKBUF_UNLOCK(sb);
 2384                         counter_u64_add(ktls_offload_corrupted_records, 1);
 2385 
 2386                         ktls_drop(so, error);
 2387                         goto deref;
 2388                 }
 2389 
 2390                 /* Is the entire record queued? */
 2391                 if (sb->sb_tlscc < tls_len)
 2392                         break;
 2393 
 2394                 /*
 2395                  * Split out the portion of the mbuf chain containing
 2396                  * this TLS record.
 2397                  */
 2398                 data = ktls_detach_record(sb, tls_len);
 2399                 if (data == NULL)
 2400                         continue;
 2401                 MPASS(sb->sb_tlsdcc == tls_len);
 2402 
 2403                 seqno = sb->sb_tls_seqno;
 2404                 sb->sb_tls_seqno++;
 2405                 SBCHECK(sb);
 2406                 SOCKBUF_UNLOCK(sb);
 2407 
 2408                 /* get crypto state for this TLS record */
 2409                 state = ktls_mbuf_crypto_state(data, 0, tls_len);
 2410 
 2411                 switch (state) {
 2412                 case KTLS_MBUF_CRYPTO_ST_MIXED:
 2413                         error = ktls_ocf_recrypt(tls, hdr, data, seqno);
 2414                         if (error)
 2415                                 break;
 2416                         /* FALLTHROUGH */
 2417                 case KTLS_MBUF_CRYPTO_ST_ENCRYPTED:
 2418                         error = ktls_ocf_decrypt(tls, hdr, data, seqno,
 2419                             &trail_len);
 2420                         if (__predict_true(error == 0)) {
 2421                                 if (tls13) {
 2422                                         error = tls13_find_record_type(tls, data,
 2423                                             tls_len, &trail_len, &record_type);
 2424                                 } else {
 2425                                         record_type = hdr->tls_type;
 2426                                 }
 2427                         }
 2428                         break;
 2429                 case KTLS_MBUF_CRYPTO_ST_DECRYPTED:
 2430                         /*
 2431                          * NIC TLS is only supported for AEAD
 2432                          * ciphersuites, which use a fixed-size
 2433                          * trailer.
 2434                          */
 2435                         if (tls13) {
 2436                                 trail_len = tls->params.tls_tlen - 1;
 2437                                 error = tls13_find_record_type(tls, data,
 2438                                     tls_len, &trail_len, &record_type);
 2439                         } else {
 2440                                 trail_len = tls->params.tls_tlen;
 2441                                 error = 0;
 2442                                 record_type = hdr->tls_type;
 2443                         }
 2444                         break;
 2445                 default:
 2446                         error = EINVAL;
 2447                         break;
 2448                 }
 2449                 if (error) {
 2450                         counter_u64_add(ktls_offload_failed_crypto, 1);
 2451 
 2452                         SOCKBUF_LOCK(sb);
 2453                         if (sb->sb_tlsdcc == 0) {
 2454                                 /*
 2455                                  * sbcut/drop/flush discarded these
 2456                                  * mbufs.
 2457                                  */
 2458                                 m_freem(data);
 2459                                 break;
 2460                         }
 2461 
 2462                         /*
 2463                          * Drop this TLS record's data, but keep
 2464                          * decrypting subsequent records.
 2465                          */
 2466                         sb->sb_ccc -= tls_len;
 2467                         sb->sb_tlsdcc = 0;
 2468 
 2469                         if (error != EMSGSIZE)
 2470                                 error = EBADMSG;
 2471                         CURVNET_SET(so->so_vnet);
 2472                         so->so_error = error;
 2473                         sorwakeup_locked(so);
 2474                         CURVNET_RESTORE();
 2475 
 2476                         m_freem(data);
 2477 
 2478                         SOCKBUF_LOCK(sb);
 2479                         continue;
 2480                 }
 2481 
 2482                 /* Allocate the control mbuf. */
 2483                 memset(&tgr, 0, sizeof(tgr));
 2484                 tgr.tls_type = record_type;
 2485                 tgr.tls_vmajor = hdr->tls_vmajor;
 2486                 tgr.tls_vminor = hdr->tls_vminor;
 2487                 tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
 2488                     trail_len);
 2489                 control = sbcreatecontrol(&tgr, sizeof(tgr),
 2490                     TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
 2491 
 2492                 SOCKBUF_LOCK(sb);
 2493                 if (sb->sb_tlsdcc == 0) {
 2494                         /* sbcut/drop/flush discarded these mbufs. */
 2495                         MPASS(sb->sb_tlscc == 0);
 2496                         m_freem(data);
 2497                         m_freem(control);
 2498                         break;
 2499                 }
 2500 
 2501                 /*
 2502                  * Clear the 'dcc' accounting in preparation for
 2503                  * adding the decrypted record.
 2504                  */
 2505                 sb->sb_ccc -= tls_len;
 2506                 sb->sb_tlsdcc = 0;
 2507                 SBCHECK(sb);
 2508 
 2509                 /* If there is no payload, drop all of the data. */
 2510                 if (tgr.tls_length == htobe16(0)) {
 2511                         m_freem(data);
 2512                         data = NULL;
 2513                 } else {
 2514                         /* Trim header. */
 2515                         remain = tls->params.tls_hlen;
 2516                         while (remain > 0) {
 2517                                 if (data->m_len > remain) {
 2518                                         data->m_data += remain;
 2519                                         data->m_len -= remain;
 2520                                         break;
 2521                                 }
 2522                                 remain -= data->m_len;
 2523                                 data = m_free(data);
 2524                         }
 2525 
 2526                         /* Trim trailer and clear M_NOTREADY. */
 2527                         remain = be16toh(tgr.tls_length);
 2528                         for (m = data; remain > m->m_len; m = m->m_next) {
 2530                                 m->m_flags &= ~(M_NOTREADY | M_DECRYPTED);
 2531                                 remain -= m->m_len;
 2532                         }
 2533                         m->m_len = remain;
 2534                         m_freem(m->m_next);
 2535                         m->m_next = NULL;
 2536                         m->m_flags &= ~(M_NOTREADY | M_DECRYPTED);
 2537 
 2538                         /* Set EOR on the final mbuf. */
 2539                         m->m_flags |= M_EOR;
 2540                 }
 2541 
 2542                 sbappendcontrol_locked(sb, data, control, 0);
 2543 
 2544                 if (__predict_false(state != KTLS_MBUF_CRYPTO_ST_DECRYPTED)) {
 2545                         sb->sb_flags |= SB_TLS_RX_RESYNC;
 2546                         SOCKBUF_UNLOCK(sb);
 2547                         ktls_resync_ifnet(so, tls_len, seqno);
 2548                         SOCKBUF_LOCK(sb);
 2549                 } else if (__predict_false(sb->sb_flags & SB_TLS_RX_RESYNC)) {
 2550                         sb->sb_flags &= ~SB_TLS_RX_RESYNC;
 2551                         SOCKBUF_UNLOCK(sb);
 2552                         ktls_resync_ifnet(so, 0, seqno);
 2553                         SOCKBUF_LOCK(sb);
 2554                 }
 2555         }
 2556 
 2557         sb->sb_flags &= ~SB_TLS_RX_RUNNING;
 2558 
 2559         if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
 2560                 so->so_error = EMSGSIZE;
 2561 
 2562         sorwakeup_locked(so);
 2563 
 2564 deref:
 2565         SOCKBUF_UNLOCK_ASSERT(sb);
 2566 
 2567         CURVNET_SET(so->so_vnet);
 2568         sorele(so);
 2569         CURVNET_RESTORE();
 2570 }
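      /*
       * Hedged userland sketch (not part of the kernel source): a
       * process reading from a KTLS RX socket receives each decrypted
       * record's metadata as a TLS_GET_RECORD control message at the
       * IPPROTO_TCP level, roughly:
       *
       *	struct tls_get_record tgr;
       *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
       *
       *	if (cmsg != NULL && cmsg->cmsg_level == IPPROTO_TCP &&
       *	    cmsg->cmsg_type == TLS_GET_RECORD) {
       *		memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
       *		// tgr.tls_type is the real record type; the
       *		// payload follows with framing already trimmed.
       *	}
       */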
 2571 
 2572 void
 2573 ktls_enqueue_to_free(struct mbuf *m)
 2574 {
 2575         struct ktls_wq *wq;
 2576         bool running;
 2577 
 2578         /* Mark it for freeing. */
 2579         m->m_epg_flags |= EPG_FLAG_2FREE;
 2580         wq = &ktls_wq[m->m_epg_tls->wq_index];
 2581         mtx_lock(&wq->mtx);
 2582         STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 2583         running = wq->running;
 2584         mtx_unlock(&wq->mtx);
 2585         if (!running)
 2586                 wakeup(wq);
 2587 }
 2588 
 2589 static void *
 2590 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
 2591 {
 2592         void *buf;
 2593         int domain, running;
 2594 
 2595         if (m->m_epg_npgs <= 2)
 2596                 return (NULL);
 2597         if (ktls_buffer_zone == NULL)
 2598                 return (NULL);
 2599         if ((u_int)(ticks - wq->lastallocfail) < hz) {
 2600                 /*
 2601                  * Rate-limit allocation attempts after a failure.
 2602                  * ktls_buffer_import() will acquire a per-domain mutex to check
 2603                  * the free page queues and may fail consistently if memory is
 2604                  * fragmented.
 2605                  */
 2606                 return (NULL);
 2607         }
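              /*
               * Illustrative note (not from the source): the unsigned
               * subtraction above yields the elapsed tick count even if
               * 'ticks' wraps, so with hz = 1000 a failure at tick
               * 100000 suppresses further attempts until tick 101000.
               */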
 2608         buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
 2609         if (buf == NULL) {
 2610                 domain = PCPU_GET(domain);
 2611                 wq->lastallocfail = ticks;
 2612 
 2613                 /*
 2614                  * Note that this check is "racy", but the races are
 2615                  * harmless: either we issue a spurious wakeup when
 2616                  * multiple threads fail allocations before the alloc
 2617                  * thread wakes, or we wait an extra second if we see
 2618                  * a stale value of running == true.
 2619                  */
 2620                 if (!VM_DOMAIN_EMPTY(domain)) {
 2621                         running = atomic_load_int(&ktls_domains[domain].alloc_td.running);
 2622                         if (!running)
 2623                                 wakeup(&ktls_domains[domain].alloc_td);
 2624                 }
 2625         }
 2626         return (buf);
 2627 }
 2628 
 2629 static int
 2630 ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m,
 2631     struct ktls_session *tls, struct ktls_ocf_encrypt_state *state)
 2632 {
 2633         vm_page_t pg;
 2634         int error, i, len, off;
 2635 
 2636         KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY),
 2637             ("%p not unready & nomap mbuf\n", m));
 2638         KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
 2639             ("page count %d larger than maximum frame length %d", m->m_epg_npgs,
 2640             ktls_maxlen));
 2641 
 2642         /* Anonymous mbufs are encrypted in place. */
 2643         if ((m->m_epg_flags & EPG_FLAG_ANON) != 0)
 2644                 return (ktls_ocf_encrypt(state, tls, m, NULL, 0));
 2645 
 2646         /*
 2647          * For file-backed mbufs (from sendfile), anonymous wired
 2648          * pages are allocated and used as the encryption destination.
 2649          */
 2650         if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
 2651                 len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len -
 2652                     m->m_epg_1st_off;
 2653                 state->dst_iov[0].iov_base = (char *)state->cbuf +
 2654                     m->m_epg_1st_off;
 2655                 state->dst_iov[0].iov_len = len;
 2656                 state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf);
 2657                 i = 1;
 2658         } else {
 2659                 off = m->m_epg_1st_off;
 2660                 for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
 2661                         pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP |
 2662                             VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 2663                         len = m_epg_pagelen(m, i, off);
 2664                         state->parray[i] = VM_PAGE_TO_PHYS(pg);
 2665                         state->dst_iov[i].iov_base =
 2666                             (char *)PHYS_TO_DMAP(state->parray[i]) + off;
 2667                         state->dst_iov[i].iov_len = len;
 2668                 }
 2669         }
 2670         KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small"));
 2671         state->dst_iov[i].iov_base = m->m_epg_trail;
 2672         state->dst_iov[i].iov_len = m->m_epg_trllen;
 2673 
 2674         error = ktls_ocf_encrypt(state, tls, m, state->dst_iov, i + 1);
 2675 
 2676         if (__predict_false(error != 0)) {
 2677                 /* Free the anonymous pages. */
 2678                 if (state->cbuf != NULL)
 2679                         uma_zfree(ktls_buffer_zone, state->cbuf);
 2680                 else {
 2681                         for (i = 0; i < m->m_epg_npgs; i++) {
 2682                                 pg = PHYS_TO_VM_PAGE(state->parray[i]);
 2683                                 (void)vm_page_unwire_noq(pg);
 2684                                 vm_page_free(pg);
 2685                         }
 2686                 }
 2687         }
 2688         return (error);
 2689 }
 2690 
 2691 /* Number of TLS records in a batch passed to ktls_enqueue(). */
 2692 static u_int
 2693 ktls_batched_records(struct mbuf *m)
 2694 {
 2695         int page_count, records;
 2696 
 2697         records = 0;
 2698         page_count = m->m_epg_enc_cnt;
 2699         while (page_count > 0) {
 2700                 records++;
 2701                 page_count -= m->m_epg_nrdy;
 2702                 m = m->m_next;
 2703         }
 2704         KASSERT(page_count == 0, ("%s: mismatched page count", __func__));
 2705         return (records);
 2706 }
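      /*
       * Illustrative example (not from the source): a batch of three
       * records with m_epg_nrdy = 4, 2, and 1 pages carries
       * m_epg_enc_cnt = 7 on its head mbuf, so the walk above counts 3
       * records.  An empty TLS 1.0 fragment still contributes
       * m_epg_nrdy = 1 and is counted like any other record.
       */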
 2707 
 2708 void
 2709 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 2710 {
 2711         struct ktls_session *tls;
 2712         struct ktls_wq *wq;
 2713         int queued;
 2714         bool running;
 2715 
 2716         KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
 2717             (M_EXTPG | M_NOTREADY)),
 2718             ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
 2719         KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
 2720 
 2721         KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
 2722 
 2723         m->m_epg_enc_cnt = page_count;
 2724 
 2725         /*
 2726          * Save a pointer to the socket.  The caller is responsible
 2727          * for taking an additional reference via soref().
 2728          */
 2729         m->m_epg_so = so;
 2730 
 2731         queued = 1;
 2732         tls = m->m_epg_tls;
 2733         wq = &ktls_wq[tls->wq_index];
 2734         mtx_lock(&wq->mtx);
 2735         if (__predict_false(tls->sequential_records)) {
 2736                 /*
 2737                  * For TLS 1.0, records must be encrypted
 2738                  * sequentially.  For a given connection, all records
 2739                  * queued to the associated work queue are processed
 2740                  * sequentially.  However, sendfile(2) might complete
 2741                  * I/O requests spanning multiple TLS records out of
 2742                  * order.  Here we ensure TLS records are enqueued to
 2743                  * the work queue in FIFO order.
 2744                  *
 2745                  * tls->next_seqno holds the sequence number of the
 2746                  * next TLS record that should be enqueued to the work
 2747                  * queue.  If this record's sequence number does not
 2748                  * match tls->next_seqno, it must be a future record, so
 2749                  * insert it, sorted by TLS sequence number, into
 2750                  * tls->pending_records and return.
 2751                  *
 2752                  * If this TLS record matches tls->next_seqno, place
 2753                  * it in the work queue and then check
 2754                  * tls->pending_records to see if any
 2755                  * previously-queued records are now ready for
 2756                  * encryption.
 2757                  */
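                      /*
                       * Worked example (illustrative): with
                       * tls->next_seqno = 10 and single-record batches
                       * arriving in the order 12, 11, 10, records 12 and
                       * 11 are parked, sorted, in pending_records; record
                       * 10 goes to the work queue and advances next_seqno
                       * to 11, after which 11 and 12 drain from
                       * pending_records and next_seqno becomes 13.
                       */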
 2758                 if (m->m_epg_seqno != tls->next_seqno) {
 2759                         struct mbuf *n, *p;
 2760 
 2761                         p = NULL;
 2762                         STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) {
 2763                                 if (n->m_epg_seqno > m->m_epg_seqno)
 2764                                         break;
 2765                                 p = n;
 2766                         }
 2767                         if (n == NULL)
 2768                                 STAILQ_INSERT_TAIL(&tls->pending_records, m,
 2769                                     m_epg_stailq);
 2770                         else if (p == NULL)
 2771                                 STAILQ_INSERT_HEAD(&tls->pending_records, m,
 2772                                     m_epg_stailq);
 2773                         else
 2774                                 STAILQ_INSERT_AFTER(&tls->pending_records, p, m,
 2775                                     m_epg_stailq);
 2776                         mtx_unlock(&wq->mtx);
 2777                         counter_u64_add(ktls_cnt_tx_pending, 1);
 2778                         return;
 2779                 }
 2780 
 2781                 tls->next_seqno += ktls_batched_records(m);
 2782                 STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 2783 
 2784                 while (!STAILQ_EMPTY(&tls->pending_records)) {
 2785                         struct mbuf *n;
 2786 
 2787                         n = STAILQ_FIRST(&tls->pending_records);
 2788                         if (n->m_epg_seqno != tls->next_seqno)
 2789                                 break;
 2790 
 2791                         queued++;
 2792                         STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq);
 2793                         tls->next_seqno += ktls_batched_records(n);
 2794                         STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq);
 2795                 }
 2796                 counter_u64_add(ktls_cnt_tx_pending, -(queued - 1));
 2797         } else
 2798                 STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 2799 
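              /* Wake the worker thread for this queue if it was idle. */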
 2800         running = wq->running;
 2801         mtx_unlock(&wq->mtx);
 2802         if (!running)
 2803                 wakeup(wq);
 2804         counter_u64_add(ktls_cnt_tx_queued, queued);
 2805 }
 2806 
 2807 /*
 2808  * Once a file-backed mbuf (from sendfile) has been encrypted, free
 2809  * the pages from the file and replace them with the anonymous pages
 2810  * allocated in ktls_encrypt_record().
 2811  */
 2812 static void
 2813 ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state)
 2814 {
 2815         int i;
 2816 
 2817         MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0);
 2818 
 2819         /* Free the old pages. */
 2820         m->m_ext.ext_free(m);
 2821 
 2822         /* Replace them with the new pages. */
 2823         if (state->cbuf != NULL) {
 2824                 for (i = 0; i < m->m_epg_npgs; i++)
 2825                         m->m_epg_pa[i] = state->parray[0] + ptoa(i);
 2826 
 2827                 /* Contig pages should go back to the cache. */
 2828                 m->m_ext.ext_free = ktls_free_mext_contig;
 2829         } else {
 2830                 for (i = 0; i < m->m_epg_npgs; i++)
 2831                         m->m_epg_pa[i] = state->parray[i];
 2832 
 2833                 /* Use the basic free routine. */
 2834                 m->m_ext.ext_free = mb_free_mext_pgs;
 2835         }
 2836 
 2837         /* Pages are now writable. */
 2838         m->m_epg_flags |= EPG_FLAG_ANON;
 2839 }
 2840 
 2841 static __noinline void
 2842 ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
 2843 {
 2844         struct ktls_ocf_encrypt_state state;
 2845         struct ktls_session *tls;
 2846         struct socket *so;
 2847         struct mbuf *m;
 2848         int error, npages, total_pages;
 2849 
 2850         so = top->m_epg_so;
 2851         tls = top->m_epg_tls;
 2852         KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 2853         KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 2854 #ifdef INVARIANTS
 2855         top->m_epg_so = NULL;
 2856 #endif
 2857         total_pages = top->m_epg_enc_cnt;
 2858         npages = 0;
 2859 
 2860         /*
 2861          * Encrypt the TLS records in the chain of mbufs starting with
 2862          * 'top'.  'total_pages' gives us a total count of pages and is
 2863          * used to know when we have finished encrypting the TLS
 2864          * records originally queued with 'top'.
 2865          *
 2866          * NB: These mbufs are queued in the socket buffer and
 2867          * 'm_next' is traversing the mbufs in the socket buffer.  The
 2868          * socket buffer lock is not held while traversing this chain.
 2869          * Since the mbufs are all marked M_NOTREADY their 'm_next'
 2870          * pointers should be stable.  However, the 'm_next' of the
 2871          * last mbuf encrypted is not necessarily NULL.  It can point
 2872          * to other mbufs appended while 'top' was on the TLS work
 2873          * queue.
 2874          *
 2875          * Each mbuf holds an entire TLS record.
 2876          */
 2877         error = 0;
 2878         for (m = top; npages != total_pages; m = m->m_next) {
 2879                 KASSERT(m->m_epg_tls == tls,
 2880                     ("different TLS sessions in a single mbuf chain: %p vs %p",
 2881                     tls, m->m_epg_tls));
 2882                 KASSERT(npages + m->m_epg_npgs <= total_pages,
 2883                     ("page count mismatch: top %p, total_pages %d, m %p", top,
 2884                     total_pages, m));
 2885 
 2886                 error = ktls_encrypt_record(wq, m, tls, &state);
 2887                 if (error) {
 2888                         counter_u64_add(ktls_offload_failed_crypto, 1);
 2889                         break;
 2890                 }
 2891 
 2892                 if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 2893                         ktls_finish_nonanon(m, &state);
 2894 
 2895                 npages += m->m_epg_nrdy;
 2896 
 2897                 /*
 2898                  * Drop a reference to the session now that it is no
 2899                  * longer needed.  Existing code distinguishes encrypted
 2900                  * records, which have no associated session, from
 2901                  * yet-to-be-encrypted records, which still have an
 2902                  * associated session.
 2903                  */
 2904                 m->m_epg_tls = NULL;
 2905                 ktls_free(tls);
 2906         }
 2907 
 2908         CURVNET_SET(so->so_vnet);
 2909         if (error == 0) {
 2910                 (void)so->so_proto->pr_ready(so, top, npages);
 2911         } else {
 2912                 ktls_drop(so, EIO);
 2913                 mb_free_notready(top, total_pages);
 2914         }
 2915 
 2916         sorele(so);
 2917         CURVNET_RESTORE();
 2918 }
 2919 
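      /*
       * Completion callback for encryption requests dispatched to an
       * asynchronous OCF backend by ktls_encrypt_async() below.  This
       * mirrors the tail of the synchronous path in ktls_encrypt():
       * restore anonymous pages, drop the session reference, mark the
       * record ready (or drop the connection on error), and release
       * the socket reference taken when the request was dispatched.
       */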
 2920 void
 2921 ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error)
 2922 {
 2923         struct ktls_session *tls;
 2924         struct socket *so;
 2925         struct mbuf *m;
 2926         int npages;
 2927 
 2928         m = state->m;
 2929 
 2930         if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 2931                 ktls_finish_nonanon(m, state);
 2932 
 2933         so = state->so;
 2934         free(state, M_KTLS);
 2935 
 2936         /*
 2937          * Drop a reference to the session now that it is no longer
 2938          * needed.  Existing code distinguishes encrypted records, which
 2939          * have no associated session, from yet-to-be-encrypted records,
 2940          * which still have an associated session.
 2941          */
 2942         tls = m->m_epg_tls;
 2943         m->m_epg_tls = NULL;
 2944         ktls_free(tls);
 2945 
 2946         if (error != 0)
 2947                 counter_u64_add(ktls_offload_failed_crypto, 1);
 2948 
 2949         CURVNET_SET(so->so_vnet);
 2950         npages = m->m_epg_nrdy;
 2951 
 2952         if (error == 0) {
 2953                 (void)so->so_proto->pr_ready(so, m, npages);
 2954         } else {
 2955                 ktls_drop(so, EIO);
 2956                 mb_free_notready(m, npages);
 2957         }
 2958 
 2959         sorele(so);
 2960         CURVNET_RESTORE();
 2961 }
 2962 
 2963 /*
 2964  * Similar to ktls_encrypt, but used with asynchronous OCF backends
 2965  * (coprocessors) where encryption does not use host CPU resources and
 2966  * it can be beneficial to queue more requests than CPUs.
 2967  */
 2968 static __noinline void
 2969 ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top)
 2970 {
 2971         struct ktls_ocf_encrypt_state *state;
 2972         struct ktls_session *tls;
 2973         struct socket *so;
 2974         struct mbuf *m, *n;
 2975         int error, mpages, npages, total_pages;
 2976 
 2977         so = top->m_epg_so;
 2978         tls = top->m_epg_tls;
 2979         KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 2980         KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 2981 #ifdef INVARIANTS
 2982         top->m_epg_so = NULL;
 2983 #endif
 2984         total_pages = top->m_epg_enc_cnt;
 2985         npages = 0;
 2986 
 2987         error = 0;
 2988         for (m = top; npages != total_pages; m = n) {
 2989                 KASSERT(m->m_epg_tls == tls,
 2990                     ("different TLS sessions in a single mbuf chain: %p vs %p",
 2991                     tls, m->m_epg_tls));
 2992                 KASSERT(npages + m->m_epg_npgs <= total_pages,
 2993                     ("page count mismatch: top %p, total_pages %d, m %p", top,
 2994                     total_pages, m));
 2995 
 2996                 state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO);
 2997                 soref(so);
 2998                 state->so = so;
 2999                 state->m = m;
 3000 
 3001                 mpages = m->m_epg_nrdy;
 3002                 n = m->m_next;
 3003 
 3004                 error = ktls_encrypt_record(wq, m, tls, state);
 3005                 if (error) {
 3006                         counter_u64_add(ktls_offload_failed_crypto, 1);
 3007                         free(state, M_KTLS);
 3008                         CURVNET_SET(so->so_vnet);
 3009                         sorele(so);
 3010                         CURVNET_RESTORE();
 3011                         break;
 3012                 }
 3013 
 3014                 npages += mpages;
 3015         }
 3016 
 3017         CURVNET_SET(so->so_vnet);
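              /*
               * On error, free the not-yet-encrypted mbufs, starting
               * with the record that failed, and drop the connection.
               */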
 3018         if (error != 0) {
 3019                 ktls_drop(so, EIO);
 3020                 mb_free_notready(m, total_pages - npages);
 3021         }
 3022 
 3023         sorele(so);
 3024         CURVNET_RESTORE();
 3025 }
 3026 
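      /*
       * Bind the calling thread to the given NUMA domain: restrict
       * its CPU affinity to the domain's CPUs and prefer memory
       * allocations from that domain.
       */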
 3027 static int
 3028 ktls_bind_domain(int domain)
 3029 {
 3030         int error;
 3031 
 3032         error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]);
 3033         if (error != 0)
 3034                 return (error);
 3035         curthread->td_domain.dr_policy = DOMAINSET_PREF(domain);
 3036         return (0);
 3037 }
 3038 
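      /*
       * Per-domain buffer allocation thread.  On each wakeup it
       * allocates, and then immediately frees, a batch of TLS
       * encryption buffers, pre-populating the zone's per-domain
       * cache so that worker threads can later allocate buffers
       * cheaply without triggering reclaim.
       */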
 3039 static void
 3040 ktls_alloc_thread(void *ctx)
 3041 {
 3042         struct ktls_domain_info *ktls_domain = ctx;
 3043         struct ktls_alloc_thread *sc = &ktls_domain->alloc_td;
 3044         void **buf;
 3045         struct sysctl_oid *oid;
 3046         char name[80];
 3047         int domain, error, i, nbufs;
 3048 
 3049         domain = ktls_domain - ktls_domains;
 3050         if (bootverbose)
 3051                 printf("Starting KTLS alloc thread for domain %d\n", domain);
 3052         error = ktls_bind_domain(domain);
 3053         if (error)
 3054                 printf("Unable to bind KTLS alloc thread for domain %d: error %d\n",
 3055                     domain, error);
 3056         snprintf(name, sizeof(name), "domain%d", domain);
 3057         oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
 3058             name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 3059         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
 3060             CTLFLAG_RD,  &sc->allocs, 0, "buffers allocated");
 3061         SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups",
 3062             CTLFLAG_RD,  &sc->wakeups, 0, "thread wakeups");
 3063         SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running",
 3064             CTLFLAG_RD,  &sc->running, 0, "thread running");
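              /*
               * The nodes above appear as, e.g.,
               * kern.ipc.tls.domain0.allocs under sysctl(8).
               */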
 3065 
 3066         buf = NULL;
 3067         nbufs = 0;
 3068         for (;;) {
 3069                 atomic_store_int(&sc->running, 0);
 3070                 tsleep(sc, PZERO | PNOLOCK, "-",  0);
 3071                 atomic_store_int(&sc->running, 1);
 3072                 sc->wakeups++;
 3073                 if (nbufs != ktls_max_alloc) {
 3074                         free(buf, M_KTLS);
 3075                         nbufs = atomic_load_int(&ktls_max_alloc);
 3076                         buf = malloc(sizeof(void *) * nbufs, M_KTLS,
 3077                             M_WAITOK | M_ZERO);
 3078                 }
 3079                 /*
 3080                  * Below we allocate nbufs with different allocation
 3081                  * flags than the ktls worker threads use when
 3082                  * allocating during encryption.  The worker threads
 3083                  * specify M_NORECLAIM; here we omit that flag and pass
 3084                  * M_WAITOK so that the VM system is permitted to
 3085                  * perform expensive work to defragment memory.  We do
 3086                  * this here because it does not matter if this thread
 3087                  * blocks.  Blocking a ktls worker thread instead would
 3088                  * risk a backlog of buffers awaiting encryption,
 3089                  * leading to surges of traffic and potential NIC
 3090                  * output drops.
 3091                  */
 3092                 for (i = 0; i < nbufs; i++) {
 3093                         buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK);
 3094                         sc->allocs++;
 3095                 }
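                      /*
                       * Freeing the buffers returns them to the zone's
                       * per-domain cache, priming it for the workers.
                       */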
 3096                 for (i = 0; i < nbufs; i++) {
 3097                         uma_zfree(ktls_buffer_zone, buf[i]);
 3098                         buf[i] = NULL;
 3099                 }
 3100         }
 3101 }
 3102 
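      /*
       * Per-CPU worker thread.  Sleeps until mbufs (TX) or sockets
       * (RX) are queued to its work queue, moves both queues to local
       * lists while holding the queue mutex, and then processes the
       * entries without the lock: encrypting or freeing each mbuf and
       * decrypting pending records on each socket.
       */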
 3103 static void
 3104 ktls_work_thread(void *ctx)
 3105 {
 3106         struct ktls_wq *wq = ctx;
 3107         struct mbuf *m, *n;
 3108         struct socket *so, *son;
 3109         STAILQ_HEAD(, mbuf) local_m_head;
 3110         STAILQ_HEAD(, socket) local_so_head;
 3111         int cpu;
 3112 
 3113         cpu = wq - ktls_wq;
 3114         if (bootverbose)
 3115                 printf("Starting KTLS worker thread for CPU %d\n", cpu);
 3116 
 3117         /*
 3118          * Bind this thread to a core.  If ktls_bind_threads is
 3119          * greater than 1, bind to the CPU's NUMA domain instead.
 3120          */
 3121         if (ktls_bind_threads) {
 3122                 int error;
 3123 
 3124                 if (ktls_bind_threads > 1) {
 3125                         struct pcpu *pc = pcpu_find(cpu);
 3126 
 3127                         error = ktls_bind_domain(pc->pc_domain);
 3128                 } else {
 3129                         cpuset_t mask;
 3130 
 3131                         CPU_SETOF(cpu, &mask);
 3132                         error = cpuset_setthread(curthread->td_tid, &mask);
 3133                 }
 3134                 if (error)
 3135                         printf("Unable to bind KTLS worker thread for CPU %d: error %d\n",
 3136                                 cpu, error);
 3137         }
 3138 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
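              /* Mark this kthread as an FPU user so crypto code may
                 use SIMD registers without an explicit fpu_kern_enter(). */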
 3139         fpu_kern_thread(0);
 3140 #endif
 3141         for (;;) {
 3142                 mtx_lock(&wq->mtx);
 3143                 while (STAILQ_EMPTY(&wq->m_head) &&
 3144                     STAILQ_EMPTY(&wq->so_head)) {
 3145                         wq->running = false;
 3146                         mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 3147                         wq->running = true;
 3148                 }
 3149 
 3150                 STAILQ_INIT(&local_m_head);
 3151                 STAILQ_CONCAT(&local_m_head, &wq->m_head);
 3152                 STAILQ_INIT(&local_so_head);
 3153                 STAILQ_CONCAT(&local_so_head, &wq->so_head);
 3154                 mtx_unlock(&wq->mtx);
 3155 
 3156                 STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
 3157                         if (m->m_epg_flags & EPG_FLAG_2FREE) {
 3158                                 ktls_free(m->m_epg_tls);
 3159                                 m_free_raw(m);
 3160                         } else {
 3161                                 if (m->m_epg_tls->sync_dispatch)
 3162                                         ktls_encrypt(wq, m);
 3163                                 else
 3164                                         ktls_encrypt_async(wq, m);
 3165                                 counter_u64_add(ktls_cnt_tx_queued, -1);
 3166                         }
 3167                 }
 3168 
 3169                 STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
 3170                         ktls_decrypt(so);
 3171                         counter_u64_add(ktls_cnt_rx_queued, -1);
 3172                 }
 3173         }
 3174 }
 3175 
 3176 #if defined(INET) || defined(INET6)
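      /*
       * Taskqueue handler that performs the switch from ifnet (inline
       * NIC) TLS to software TLS for a session; scheduled by
       * ktls_disable_ifnet() below so that it runs in a context that
       * may sleep.  Drops the inp, socket, and session references
       * taken when the task was enqueued.
       */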
 3177 static void
 3178 ktls_disable_ifnet_help(void *context, int pending __unused)
 3179 {
 3180         struct ktls_session *tls;
 3181         struct inpcb *inp;
 3182         struct tcpcb *tp;
 3183         struct socket *so;
 3184         int err;
 3185 
 3186         tls = context;
 3187         inp = tls->inp;
 3188         if (inp == NULL)
 3189                 return;
 3190         INP_WLOCK(inp);
 3191         so = inp->inp_socket;
 3192         MPASS(so != NULL);
 3193         if (inp->inp_flags & INP_DROPPED) {
 3194                 goto out;
 3195         }
 3196 
 3197         if (so->so_snd.sb_tls_info != NULL)
 3198                 err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
 3199         else
 3200                 err = ENXIO;
 3201         if (err == 0) {
 3202                 counter_u64_add(ktls_ifnet_disable_ok, 1);
 3203                 /* ktls_set_tx_mode() drops inp wlock, so recheck flags */
 3204                 if ((inp->inp_flags & INP_DROPPED) == 0 &&
 3205                     (tp = intotcpcb(inp)) != NULL &&
 3206                     tp->t_fb->tfb_hwtls_change != NULL)
 3207                         (*tp->t_fb->tfb_hwtls_change)(tp, 0);
 3208         } else {
 3209                 counter_u64_add(ktls_ifnet_disable_fail, 1);
 3210         }
 3211 
 3212 out:
 3213         CURVNET_SET(so->so_vnet);
 3214         sorele(so);
 3215         CURVNET_RESTORE();
 3216         if (!in_pcbrele_wlocked(inp))
 3217                 INP_WUNLOCK(inp);
 3218         ktls_free(tls);
 3219 }
 3220 
 3221 /*
 3222  * Called when re-transmits are becoming a substantial portion of the
 3223  * sends on this connection.  When this happens, we transition the
 3224  * connection to software TLS.  This is needed because most inline TLS
 3225  * NICs keep crypto state only for in-order transmits.  This means
 3226  * that to handle a TCP rexmit (which is out-of-order), the NIC must
 3227  * re-DMA the entire TLS record up to and including the current
 3228  * segment.  When re-transmitting the last ~1448 byte segment of a
 3229  * 16KB TLS record, we can thus wind up re-DMA'ing an order of
 3230  * magnitude more data than we are sending.  This can cause the
 3231  * PCIe link to saturate well before the network, which can cause
 3232  * output drops, and a general loss of capacity.
 3233  */
 3234 void
 3235 ktls_disable_ifnet(void *arg)
 3236 {
 3237         struct tcpcb *tp;
 3238         struct inpcb *inp;
 3239         struct socket *so;
 3240         struct ktls_session *tls;
 3241 
 3242         tp = arg;
 3243         inp = tptoinpcb(tp);
 3244         INP_WLOCK_ASSERT(inp);
 3245         so = inp->inp_socket;
 3246         SOCK_LOCK(so);
 3247         tls = so->so_snd.sb_tls_info;
 3248         if (tls->disable_ifnet_pending) {
 3249                 SOCK_UNLOCK(so);
 3250                 return;
 3251         }
 3252 
 3253         /*
 3254          * Note that disable_ifnet_pending is never cleared: ifnet TLS
 3255          * may be disabled at most once per session, so there is never
 3256          * a reason to retry the transition.
 3257          */
 3258 
 3259         (void)ktls_hold(tls);
 3260         in_pcbref(inp);
 3261         soref(so);
 3262         tls->disable_ifnet_pending = true;
 3263         tls->inp = inp;
 3264         SOCK_UNLOCK(so);
 3265         TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
 3266         (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
 3267 }
 3268 #endif
