sys/kern/kern_mbuf.c
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2004, 2005,
 *     Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_param.h"
#include "opt_kern_tls.h"

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/domainset.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/ktls.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/sf_buf.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>

#include <net/if.h>
#include <net/if_var.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_map.h>
#include <vm/uma.h>
#include <vm/uma_dbg.h>

/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Primary Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Primary Zone,
 * thus sharing backend Slab kegs with the Mbuf Primary Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *  m_clget()                m_getcl()
 *  |                         |
 *  |  .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *  |                [   Packet   ]              |
 *  [(Cluster Cache)] [ Secondary ]      [ (Mbuf Cache)      ]
 *  [ Cluster Zone  ] [   Zone    ]      [ Mbuf Primary Zone ]
 *        |                 \________           |
 *  [ Cluster Keg   ]                 \        /
 *        |                          [ Mbuf Keg ]
 *  [ Cluster Slabs ]                     |
 *        |                          [ Mbuf Slabs ]
 *         \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones its _ctor_ function is executed.  Likewise,
 * whenever an object is deallocated through uma_zfree() its
 * _dtor_ function is executed.
 *
 * Caches are per-CPU and are filled from the Primary Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Kegs are overfull, objects get decommissioned with
 * _zfini_ functions and freed back to the global memory pool.
 *
 */
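
/*
 * Illustrative sketch (not part of this file's build): the two common
 * allocation paths shown in the diagram above.  m_getcl() takes the
 * fast path through the Packet Zone, returning an mbuf with a 2k
 * cluster already attached, while m_gethdr()/m_clget() allocate the
 * two pieces separately.  The function name is hypothetical.
 */
#if 0
static struct mbuf *
example_alloc_paths(void)
{
        struct mbuf *m1, *m2;

        /* One call: mbuf + 2k cluster from the Packet Zone caches. */
        m1 = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);

        /* Two calls: mbuf from the Mbuf Zone, then a cluster. */
        m2 = m_gethdr(M_NOWAIT, MT_DATA);
        if (m2 != NULL && m_clget(m2, M_NOWAIT) == 0) {
                m_freem(m2);    /* No cluster was available. */
                m2 = NULL;
        }
        if (m1 != NULL)
                m_freem(m1);
        return (m2);
}
#endif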

int nmbufs;                     /* limits number of mbufs */
int nmbclusters;                /* limits number of mbuf clusters */
int nmbjumbop;                  /* limits number of page size jumbo clusters */
int nmbjumbo9;                  /* limits number of 9k jumbo clusters */
int nmbjumbo16;                 /* limits number of 16k jumbo clusters */

bool mb_use_ext_pgs = false;    /* use M_EXTPG mbufs for sendfile & TLS */

static int
sysctl_mb_use_ext_pgs(SYSCTL_HANDLER_ARGS)
{
        int error, extpg;

        extpg = mb_use_ext_pgs;
        error = sysctl_handle_int(oidp, &extpg, 0, req);
        if (error == 0 && req->newptr != NULL) {
                if (extpg != 0 && !PMAP_HAS_DMAP)
                        error = EOPNOTSUPP;
                else
                        mb_use_ext_pgs = extpg != 0;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_use_ext_pgs, CTLTYPE_INT | CTLFLAG_RW,
    &mb_use_ext_pgs, 0,
    sysctl_mb_use_ext_pgs, "IU",
    "Use unmapped mbufs for sendfile(2) and TLS offload");

static quad_t maxmbufmem;       /* overall real memory limit for all mbufs */

SYSCTL_QUAD(_kern_ipc, OID_AUTO, maxmbufmem, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &maxmbufmem, 0, "Maximum real memory allocatable to various mbuf types");

static counter_u64_t snd_tag_count;
SYSCTL_COUNTER_U64(_kern_ipc, OID_AUTO, num_snd_tags, CTLFLAG_RW,
    &snd_tag_count, "# of active mbuf send tags");

/*
 * tunable_mbinit() has to be run before any mbuf allocations are done.
 */
static void
tunable_mbinit(void *dummy)
{
        quad_t realmem;
        int extpg;

        /*
         * The default limit for all mbuf related memory is 1/2 of all
         * available kernel memory (physical or kmem).
         * At most it can be 3/4 of available kernel memory.
         */
        realmem = qmin((quad_t)physmem * PAGE_SIZE, vm_kmem_size);
        maxmbufmem = realmem / 2;
        TUNABLE_QUAD_FETCH("kern.ipc.maxmbufmem", &maxmbufmem);
        if (maxmbufmem > realmem / 4 * 3)
                maxmbufmem = realmem / 4 * 3;

        TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
        if (nmbclusters == 0)
                nmbclusters = maxmbufmem / MCLBYTES / 4;

        TUNABLE_INT_FETCH("kern.ipc.nmbjumbop", &nmbjumbop);
        if (nmbjumbop == 0)
                nmbjumbop = maxmbufmem / MJUMPAGESIZE / 4;

        TUNABLE_INT_FETCH("kern.ipc.nmbjumbo9", &nmbjumbo9);
        if (nmbjumbo9 == 0)
                nmbjumbo9 = maxmbufmem / MJUM9BYTES / 6;

        TUNABLE_INT_FETCH("kern.ipc.nmbjumbo16", &nmbjumbo16);
        if (nmbjumbo16 == 0)
                nmbjumbo16 = maxmbufmem / MJUM16BYTES / 6;

        /*
         * We need at least as many mbufs as we have clusters of
         * the various types added together.
         */
        TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
        if (nmbufs < nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16)
                nmbufs = lmax(maxmbufmem / MSIZE / 5,
                    nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16);

        /*
         * Unmapped mbufs can only safely be used on platforms with a direct
         * map.
         */
        if (PMAP_HAS_DMAP) {
                extpg = mb_use_ext_pgs;
                TUNABLE_INT_FETCH("kern.ipc.mb_use_ext_pgs", &extpg);
                mb_use_ext_pgs = extpg != 0;
        }
}
SYSINIT(tunable_mbinit, SI_SUB_KMEM, SI_ORDER_MIDDLE, tunable_mbinit, NULL);
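
/*
 * Since these limits are fetched as tunables before any allocations
 * happen, they can be raised above the computed defaults only from the
 * loader, e.g. (illustrative /boot/loader.conf values):
 *
 *      kern.ipc.maxmbufmem="1073741824"
 *      kern.ipc.nmbclusters="262144"
 */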

static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbclusters;

        newnmbclusters = nmbclusters;
        error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
        if (error == 0 && req->newptr && newnmbclusters != nmbclusters) {
                if (newnmbclusters > nmbclusters &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbclusters = newnmbclusters;
                        nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
                        EVENTHANDLER_INVOKE(nmbclusters_change);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbclusters, 0,
    sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");

static int
sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbjumbop;

        newnmbjumbop = nmbjumbop;
        error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req);
        if (error == 0 && req->newptr && newnmbjumbop != nmbjumbop) {
                if (newnmbjumbop > nmbjumbop &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbjumbop = newnmbjumbop;
                        nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbop, 0,
    sysctl_nmbjumbop, "IU",
    "Maximum number of mbuf page size jumbo clusters allowed");

static int
sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbjumbo9;

        newnmbjumbo9 = nmbjumbo9;
        error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req);
        if (error == 0 && req->newptr && newnmbjumbo9 != nmbjumbo9) {
                if (newnmbjumbo9 > nmbjumbo9 &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbjumbo9 = newnmbjumbo9;
                        nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo9, 0,
    sysctl_nmbjumbo9, "IU",
    "Maximum number of mbuf 9k jumbo clusters allowed");

static int
sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbjumbo16;

        newnmbjumbo16 = nmbjumbo16;
        error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req);
        if (error == 0 && req->newptr && newnmbjumbo16 != nmbjumbo16) {
                if (newnmbjumbo16 > nmbjumbo16 &&
                    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
                        nmbjumbo16 = newnmbjumbo16;
                        nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &nmbjumbo16, 0,
    sysctl_nmbjumbo16, "IU",
    "Maximum number of mbuf 16k jumbo clusters allowed");

static int
sysctl_nmbufs(SYSCTL_HANDLER_ARGS)
{
        int error, newnmbufs;

        newnmbufs = nmbufs;
        error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
        if (error == 0 && req->newptr && newnmbufs != nmbufs) {
                if (newnmbufs > nmbufs) {
                        nmbufs = newnmbufs;
                        nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
                        EVENTHANDLER_INVOKE(nmbufs_change);
                } else
                        error = EINVAL;
        }
        return (error);
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbufs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &nmbufs, 0, sysctl_nmbufs, "IU",
    "Maximum number of mbufs allowed");

/*
 * Zones from which we allocate.
 */
uma_zone_t      zone_mbuf;
uma_zone_t      zone_clust;
uma_zone_t      zone_pack;
uma_zone_t      zone_jumbop;
uma_zone_t      zone_jumbo9;
uma_zone_t      zone_jumbo16;

/*
 * Local prototypes.
 */
static int      mb_ctor_mbuf(void *, int, void *, int);
static int      mb_ctor_clust(void *, int, void *, int);
static int      mb_ctor_pack(void *, int, void *, int);
static void     mb_dtor_mbuf(void *, int, void *);
static void     mb_dtor_pack(void *, int, void *);
static int      mb_zinit_pack(void *, int, int);
static void     mb_zfini_pack(void *, int);
static void     mb_reclaim(uma_zone_t, int);

/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
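
/*
 * Worked example of the assertion above: for MSIZE = 256,
 * (255 ^ 256) = 511 and (511 + 1) >> 1 = 256, so the test holds; for a
 * non-power-of-2 such as 320, (319 ^ 320) = 127 and
 * (127 + 1) >> 1 = 64 != 320, so the assertion fails.
 */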

_Static_assert(sizeof(struct mbuf) <= MSIZE,
    "size of mbuf exceeds MSIZE");

/*
 * Initialize FreeBSD Network buffer allocation.
 */
static void
mbuf_init(void *dummy)
{

        /*
         * Configure UMA zones for Mbufs, Clusters, and Packets.
         */
        zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
            mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
            MSIZE - 1, UMA_ZONE_CONTIG | UMA_ZONE_MAXBUCKET);
        if (nmbufs > 0)
                nmbufs = uma_zone_set_max(zone_mbuf, nmbufs);
        uma_zone_set_warning(zone_mbuf, "kern.ipc.nmbufs limit reached");
        uma_zone_set_maxaction(zone_mbuf, mb_reclaim);

        zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
            mb_ctor_clust, NULL, NULL, NULL,
            UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
        if (nmbclusters > 0)
                nmbclusters = uma_zone_set_max(zone_clust, nmbclusters);
        uma_zone_set_warning(zone_clust, "kern.ipc.nmbclusters limit reached");
        uma_zone_set_maxaction(zone_clust, mb_reclaim);

        zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
            mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

        /* Make jumbo frame zone too.  Page size, 9k and 16k. */
        zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
            mb_ctor_clust, NULL, NULL, NULL,
            UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
        if (nmbjumbop > 0)
                nmbjumbop = uma_zone_set_max(zone_jumbop, nmbjumbop);
        uma_zone_set_warning(zone_jumbop, "kern.ipc.nmbjumbop limit reached");
        uma_zone_set_maxaction(zone_jumbop, mb_reclaim);

        zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
            mb_ctor_clust, NULL, NULL, NULL,
            UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
        if (nmbjumbo9 > 0)
                nmbjumbo9 = uma_zone_set_max(zone_jumbo9, nmbjumbo9);
        uma_zone_set_warning(zone_jumbo9, "kern.ipc.nmbjumbo9 limit reached");
        uma_zone_set_maxaction(zone_jumbo9, mb_reclaim);

        zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
            mb_ctor_clust, NULL, NULL, NULL,
            UMA_ALIGN_PTR, UMA_ZONE_CONTIG);
        if (nmbjumbo16 > 0)
                nmbjumbo16 = uma_zone_set_max(zone_jumbo16, nmbjumbo16);
        uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
        uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);

        /*
         * Hook event handler for low-memory situation, used to
         * drain protocols and push data back to the caches (UMA
         * later pushes it back to VM).
         */
        EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
            EVENTHANDLER_PRI_FIRST);

        snd_tag_count = counter_u64_alloc(M_WAITOK);
}
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);

#ifdef DEBUGNET
/*
 * debugnet makes use of a pre-allocated pool of mbufs and clusters.  When
 * debugnet is configured, we initialize a set of UMA cache zones which return
 * items from this pool.  At panic-time, the regular UMA zone pointers are
 * overwritten with those of the cache zones so that drivers may allocate and
 * free mbufs and clusters without attempting to allocate physical memory.
 *
 * We keep mbufs and clusters in a pair of mbuf queues.  In particular, for
 * the purpose of caching clusters, we treat them as mbufs.
 */
static struct mbufq dn_mbufq =
    { STAILQ_HEAD_INITIALIZER(dn_mbufq.mq_head), 0, INT_MAX };
static struct mbufq dn_clustq =
    { STAILQ_HEAD_INITIALIZER(dn_clustq.mq_head), 0, INT_MAX };

static int dn_clsize;
static uma_zone_t dn_zone_mbuf;
static uma_zone_t dn_zone_clust;
static uma_zone_t dn_zone_pack;

static struct debugnet_saved_zones {
        uma_zone_t dsz_mbuf;
        uma_zone_t dsz_clust;
        uma_zone_t dsz_pack;
        uma_zone_t dsz_jumbop;
        uma_zone_t dsz_jumbo9;
        uma_zone_t dsz_jumbo16;
        bool dsz_debugnet_zones_enabled;
} dn_saved_zones;

static int
dn_buf_import(void *arg, void **store, int count, int domain __unused,
    int flags)
{
        struct mbufq *q;
        struct mbuf *m;
        int i;

        q = arg;

        for (i = 0; i < count; i++) {
                m = mbufq_dequeue(q);
                if (m == NULL)
                        break;
                trash_init(m, q == &dn_mbufq ? MSIZE : dn_clsize, flags);
                store[i] = m;
        }
        KASSERT((flags & M_WAITOK) == 0 || i == count,
            ("%s: ran out of pre-allocated mbufs", __func__));
        return (i);
}

static void
dn_buf_release(void *arg, void **store, int count)
{
        struct mbufq *q;
        struct mbuf *m;
        int i;

        q = arg;

        for (i = 0; i < count; i++) {
                m = store[i];
                (void)mbufq_enqueue(q, m);
        }
}

static int
dn_pack_import(void *arg __unused, void **store, int count, int domain __unused,
    int flags __unused)
{
        struct mbuf *m;
        void *clust;
        int i;

        for (i = 0; i < count; i++) {
                m = m_get(M_NOWAIT, MT_DATA);
                if (m == NULL)
                        break;
                clust = uma_zalloc(dn_zone_clust, M_NOWAIT);
                if (clust == NULL) {
                        m_free(m);
                        break;
                }
                mb_ctor_clust(clust, dn_clsize, m, 0);
                store[i] = m;
        }
        KASSERT((flags & M_WAITOK) == 0 || i == count,
            ("%s: ran out of pre-allocated mbufs", __func__));
        return (i);
}

static void
dn_pack_release(void *arg __unused, void **store, int count)
{
        struct mbuf *m;
        void *clust;
        int i;

        for (i = 0; i < count; i++) {
                m = store[i];
                clust = m->m_ext.ext_buf;
                uma_zfree(dn_zone_clust, clust);
                uma_zfree(dn_zone_mbuf, m);
        }
}

/*
 * Free the pre-allocated mbufs and clusters reserved for debugnet, and destroy
 * the corresponding UMA cache zones.
 */
void
debugnet_mbuf_drain(void)
{
        struct mbuf *m;
        void *item;

        if (dn_zone_mbuf != NULL) {
                uma_zdestroy(dn_zone_mbuf);
                dn_zone_mbuf = NULL;
        }
        if (dn_zone_clust != NULL) {
                uma_zdestroy(dn_zone_clust);
                dn_zone_clust = NULL;
        }
        if (dn_zone_pack != NULL) {
                uma_zdestroy(dn_zone_pack);
                dn_zone_pack = NULL;
        }

        while ((m = mbufq_dequeue(&dn_mbufq)) != NULL)
                m_free(m);
        while ((item = mbufq_dequeue(&dn_clustq)) != NULL)
                uma_zfree(m_getzone(dn_clsize), item);
}

/*
 * Callback invoked immediately prior to starting a debugnet connection.
 */
void
debugnet_mbuf_start(void)
{

        MPASS(!dn_saved_zones.dsz_debugnet_zones_enabled);

        /* Save the old zone pointers to restore when debugnet is closed. */
        dn_saved_zones = (struct debugnet_saved_zones) {
                .dsz_debugnet_zones_enabled = true,
                .dsz_mbuf = zone_mbuf,
                .dsz_clust = zone_clust,
                .dsz_pack = zone_pack,
                .dsz_jumbop = zone_jumbop,
                .dsz_jumbo9 = zone_jumbo9,
                .dsz_jumbo16 = zone_jumbo16,
        };

        /*
         * All cluster zones return buffers of the size requested by the
         * drivers.  It's up to the driver to reinitialize the zones if the
         * MTU of a debugnet-enabled interface changes.
         */
        printf("debugnet: overwriting mbuf zone pointers\n");
        zone_mbuf = dn_zone_mbuf;
        zone_clust = dn_zone_clust;
        zone_pack = dn_zone_pack;
        zone_jumbop = dn_zone_clust;
        zone_jumbo9 = dn_zone_clust;
        zone_jumbo16 = dn_zone_clust;
}

/*
 * Callback invoked when a debugnet connection is closed/finished.
 */
void
debugnet_mbuf_finish(void)
{

        MPASS(dn_saved_zones.dsz_debugnet_zones_enabled);

        printf("debugnet: restoring mbuf zone pointers\n");
        zone_mbuf = dn_saved_zones.dsz_mbuf;
        zone_clust = dn_saved_zones.dsz_clust;
        zone_pack = dn_saved_zones.dsz_pack;
        zone_jumbop = dn_saved_zones.dsz_jumbop;
        zone_jumbo9 = dn_saved_zones.dsz_jumbo9;
        zone_jumbo16 = dn_saved_zones.dsz_jumbo16;

        memset(&dn_saved_zones, 0, sizeof(dn_saved_zones));
}

/*
 * Reinitialize the debugnet mbuf+cluster pool and cache zones.
 */
void
debugnet_mbuf_reinit(int nmbuf, int nclust, int clsize)
{
        struct mbuf *m;
        void *item;

        debugnet_mbuf_drain();

        dn_clsize = clsize;

        dn_zone_mbuf = uma_zcache_create("debugnet_" MBUF_MEM_NAME,
            MSIZE, mb_ctor_mbuf, mb_dtor_mbuf, NULL, NULL,
            dn_buf_import, dn_buf_release,
            &dn_mbufq, UMA_ZONE_NOBUCKET);

        dn_zone_clust = uma_zcache_create("debugnet_" MBUF_CLUSTER_MEM_NAME,
            clsize, mb_ctor_clust, NULL, NULL, NULL,
            dn_buf_import, dn_buf_release,
            &dn_clustq, UMA_ZONE_NOBUCKET);

        dn_zone_pack = uma_zcache_create("debugnet_" MBUF_PACKET_MEM_NAME,
            MCLBYTES, mb_ctor_pack, mb_dtor_pack, NULL, NULL,
            dn_pack_import, dn_pack_release,
            NULL, UMA_ZONE_NOBUCKET);

        while (nmbuf-- > 0) {
                m = m_get(M_WAITOK, MT_DATA);
                uma_zfree(dn_zone_mbuf, m);
        }
        while (nclust-- > 0) {
                item = uma_zalloc(m_getzone(dn_clsize), M_WAITOK);
                uma_zfree(dn_zone_clust, item);
        }
}
#endif /* DEBUGNET */
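
/*
 * Illustrative sketch (hypothetical values): a debugnet attach path
 * would size the pool for its expected traffic and then seed it with
 * debugnet_mbuf_reinit(), e.g. enough MCLBYTES clusters for a few
 * in-flight packets.
 */
#if 0
static void
example_debugnet_setup(void)
{
        /* 128 mbufs and 64 clusters of MCLBYTES each; numbers invented. */
        debugnet_mbuf_reinit(128, 64, MCLBYTES);
}
#endif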

/*
 * Constructor for Mbuf primary zone.
 *
 * The 'arg' pointer points to a mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
        struct mbuf *m;
        struct mb_args *args;
        int error;
        int flags;
        short type;

        args = (struct mb_args *)arg;
        type = args->type;

        /*
         * The mbuf is initialized later.  The caller has the
         * responsibility to set up any MAC labels too.
         */
        if (type == MT_NOINIT)
                return (0);

        m = (struct mbuf *)mem;
        flags = args->flags;
        MPASS((flags & M_NOFREE) == 0);

        error = m_init(m, how, type, flags);

        return (error);
}

/*
 * The Mbuf primary zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
        struct mbuf *m;
        unsigned long flags;

        m = (struct mbuf *)mem;
        flags = (unsigned long)arg;

        KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
        if (!(flags & MB_DTOR_SKIP) && (m->m_flags & M_PKTHDR) &&
            !SLIST_EMPTY(&m->m_pkthdr.tags))
                m_tag_delete_chain(m, NULL);
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
        struct mbuf *m;

        m = (struct mbuf *)mem;
        if ((m->m_flags & M_PKTHDR) != 0)
                m_tag_delete_chain(m, NULL);

        /* Make sure we've got a clean cluster back. */
        KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
        KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
        KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
        KASSERT(m->m_ext.ext_arg1 == NULL, ("%s: ext_arg1 != NULL", __func__));
        KASSERT(m->m_ext.ext_arg2 == NULL, ("%s: ext_arg2 != NULL", __func__));
        KASSERT(m->m_ext.ext_size == MCLBYTES,
            ("%s: ext_size != MCLBYTES", __func__));
        KASSERT(m->m_ext.ext_type == EXT_PACKET,
            ("%s: ext_type != EXT_PACKET", __func__));
#ifdef INVARIANTS
        trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
        /*
         * If there are processes blocked on zone_clust, waiting for pages
         * to be freed up, cause them to be woken up by draining the
         * packet zone.  We are exposed to a race here (in the check for
         * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
         * is deliberate.  We don't want to acquire the zone lock for every
         * mbuf free.
         */
        if (uma_zone_exhausted(zone_clust))
                uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf which we
 * are configuring cluster storage for.  If 'arg' is
 * empty we allocate just the cluster without setting
 * the mbuf to it.  See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
        struct mbuf *m;

        m = (struct mbuf *)arg;
        if (m != NULL) {
                m->m_ext.ext_buf = (char *)mem;
                m->m_data = m->m_ext.ext_buf;
                m->m_flags |= M_EXT;
                m->m_ext.ext_free = NULL;
                m->m_ext.ext_arg1 = NULL;
                m->m_ext.ext_arg2 = NULL;
                m->m_ext.ext_size = size;
                m->m_ext.ext_type = m_gettype(size);
                m->m_ext.ext_flags = EXT_FLAG_EMBREF;
                m->m_ext.ext_count = 1;
        }

        return (0);
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
        struct mbuf *m;

        m = (struct mbuf *)mem;         /* m is virgin. */
        if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
            m->m_ext.ext_buf == NULL)
                return (ENOMEM);
        m->m_ext.ext_type = EXT_PACKET; /* Override. */
#ifdef INVARIANTS
        trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
        return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
        struct mbuf *m;

        m = (struct mbuf *)mem;
#ifdef INVARIANTS
        trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
        uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#ifdef INVARIANTS
        trash_dtor(mem, size, NULL);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
        struct mbuf *m;
        struct mb_args *args;
        int error, flags;
        short type;

        m = (struct mbuf *)mem;
        args = (struct mb_args *)arg;
        flags = args->flags;
        type = args->type;
        MPASS((flags & M_NOFREE) == 0);

#ifdef INVARIANTS
        trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
#endif

        error = m_init(m, how, type, flags);

        /* m_ext is already initialized. */
        m->m_data = m->m_ext.ext_buf;
        m->m_flags = (flags | M_EXT);

        return (error);
}

/*
 * This is the protocol drain routine.  Called by UMA whenever any of the
 * mbuf zones is close to its limit.
 *
 * No locks should be held when this is called.  The drain routines have to
 * presently acquire some locks which raises the possibility of lock order
 * reversal.
 */
static void
mb_reclaim(uma_zone_t zone __unused, int pending __unused)
{
        struct epoch_tracker et;
        struct domain *dp;
        struct protosw *pr;

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL, __func__);

        NET_EPOCH_ENTER(et);
        for (dp = domains; dp != NULL; dp = dp->dom_next)
                for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
                        if (pr->pr_drain != NULL)
                                (*pr->pr_drain)();
        NET_EPOCH_EXIT(et);
}

/*
 * Free "count" units of I/O from an mbuf chain.  They could be held
 * in M_EXTPG or just as a normal mbuf.  This code is intended to be
 * called in an error path (I/O error, closed connection, etc).
 */
void
mb_free_notready(struct mbuf *m, int count)
{
        int i;

        for (i = 0; i < count && m != NULL; i++) {
                if ((m->m_flags & M_EXTPG) != 0) {
                        m->m_epg_nrdy--;
                        if (m->m_epg_nrdy != 0)
                                continue;
                }
                m = m_free(m);
        }
        KASSERT(i == count, ("Removed only %d items from %p", i, m));
}

/*
 * Compress an unmapped mbuf into a simple mbuf when it holds a small
 * amount of data.  This is used as a DOS defense to avoid having
 * small packets tie up wired pages, an ext_pgs structure, and an
 * mbuf.  Since this converts the existing mbuf in place, it can only
 * be used if there are no other references to 'm'.
 */
int
mb_unmapped_compress(struct mbuf *m)
{
        volatile u_int *refcnt;
        char buf[MLEN];

        /*
         * Assert that 'm' does not have a packet header.  If 'm' had
         * a packet header, it would only be able to hold MHLEN bytes
         * and m_data would have to be initialized differently.
         */
        KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXTPG),
            ("%s: m %p !M_EXTPG or M_PKTHDR", __func__, m));
        KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));

        if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
                refcnt = &m->m_ext.ext_count;
        } else {
                KASSERT(m->m_ext.ext_cnt != NULL,
                    ("%s: no refcounting pointer on %p", __func__, m));
                refcnt = m->m_ext.ext_cnt;
        }

        if (*refcnt != 1)
                return (EBUSY);

        m_copydata(m, 0, m->m_len, buf);

        /* Free the backing pages. */
        m->m_ext.ext_free(m);

        /* Turn 'm' into a "normal" mbuf. */
        m->m_flags &= ~(M_EXT | M_RDONLY | M_EXTPG);
        m->m_data = m->m_dat;

        /* Copy data back into m. */
        bcopy(buf, mtod(m, char *), m->m_len);

        return (0);
}
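
/*
 * Illustrative sketch (hypothetical caller): a receive path might apply
 * the compression above to short unmapped mbufs, leaving the mbuf alone
 * when other references exist (EBUSY).
 */
#if 0
static void
example_maybe_compress(struct mbuf *m)
{
        if ((m->m_flags & M_EXTPG) != 0 && (m->m_flags & M_PKTHDR) == 0 &&
            m->m_len <= MLEN)
                (void)mb_unmapped_compress(m);  /* EBUSY is ignored here. */
}
#endif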

/*
 * These next few routines are used to permit downgrading an unmapped
 * mbuf to a chain of mapped mbufs.  This is used when an interface
 * doesn't support unmapped mbufs or if checksums need to be
 * computed in software.
 *
 * Each unmapped mbuf is converted to a chain of mbufs.  First, any
 * TLS header data is stored in a regular mbuf.  Second, each page of
 * unmapped data is stored in an mbuf with an EXT_SFBUF external
 * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
 * associated physical page.  They also hold a reference on the
 * original M_EXTPG mbuf to ensure the physical page doesn't go away.
 * Finally, any TLS trailer data is stored in a regular mbuf.
 *
 * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
 * mbufs.  It frees the associated sf_buf and releases its reference
 * on the original M_EXTPG mbuf.
 *
 * _mb_unmapped_to_ext() is a helper function that converts a single
 * unmapped mbuf into a chain of mbufs.
 *
 * mb_unmapped_to_ext() is the public function that walks an mbuf
 * chain converting any unmapped mbufs to mapped mbufs.  It returns
 * the new chain of mapped mbufs on success.  On failure it frees
 * the original mbuf chain and returns NULL.
 */
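
/*
 * Illustrative sketch (hypothetical driver code): a transmit routine for
 * hardware that cannot handle unmapped buffers would downgrade the chain
 * before DMA mapping.
 */
#if 0
static int
example_xmit(struct mbuf *m)
{
        m = mb_unmapped_to_ext(m);
        if (m == NULL)
                return (ENOBUFS);       /* mb_unmapped_to_ext() freed it. */
        /* ... hand 'm' to the hardware ... */
        return (0);
}
#endif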

static void
mb_unmapped_free_mext(struct mbuf *m)
{
        struct sf_buf *sf;
        struct mbuf *old_m;

        sf = m->m_ext.ext_arg1;
        sf_buf_free(sf);

        /* Drop the reference on the backing M_EXTPG mbuf. */
        old_m = m->m_ext.ext_arg2;
        mb_free_extpg(old_m);
}

static struct mbuf *
_mb_unmapped_to_ext(struct mbuf *m)
{
        struct mbuf *m_new, *top, *prev, *mref;
        struct sf_buf *sf;
        vm_page_t pg;
        int i, len, off, pglen, pgoff, seglen, segoff;
        volatile u_int *refcnt;
        u_int ref_inc = 0;

        M_ASSERTEXTPG(m);
        len = m->m_len;
        KASSERT(m->m_epg_tls == NULL, ("%s: can't convert TLS mbuf %p",
            __func__, m));

        /* See if this is the mbuf that holds the embedded refcount. */
        if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
                refcnt = &m->m_ext.ext_count;
                mref = m;
        } else {
                KASSERT(m->m_ext.ext_cnt != NULL,
                    ("%s: no refcounting pointer on %p", __func__, m));
                refcnt = m->m_ext.ext_cnt;
                mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
        }

        /* Skip over any data removed from the front. */
        off = mtod(m, vm_offset_t);

        top = NULL;
        if (m->m_epg_hdrlen != 0) {
                if (off >= m->m_epg_hdrlen) {
                        off -= m->m_epg_hdrlen;
                } else {
                        seglen = m->m_epg_hdrlen - off;
                        segoff = off;
                        seglen = min(seglen, len);
                        off = 0;
                        len -= seglen;
                        m_new = m_get(M_NOWAIT, MT_DATA);
                        if (m_new == NULL)
                                goto fail;
                        m_new->m_len = seglen;
                        prev = top = m_new;
                        memcpy(mtod(m_new, void *), &m->m_epg_hdr[segoff],
                            seglen);
                }
        }
        pgoff = m->m_epg_1st_off;
        for (i = 0; i < m->m_epg_npgs && len > 0; i++) {
                pglen = m_epg_pagelen(m, i, pgoff);
                if (off >= pglen) {
                        off -= pglen;
                        pgoff = 0;
                        continue;
                }
                seglen = pglen - off;
                segoff = pgoff + off;
                off = 0;
                seglen = min(seglen, len);
                len -= seglen;

                pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
                m_new = m_get(M_NOWAIT, MT_DATA);
                if (m_new == NULL)
                        goto fail;
                if (top == NULL) {
                        top = prev = m_new;
                } else {
                        prev->m_next = m_new;
                        prev = m_new;
                }
                sf = sf_buf_alloc(pg, SFB_NOWAIT);
                if (sf == NULL)
                        goto fail;

                ref_inc++;
                m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
                    mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
                m_new->m_data += segoff;
                m_new->m_len = seglen;

                pgoff = 0;
        }
        if (len != 0) {
                KASSERT((off + len) <= m->m_epg_trllen,
                    ("off + len > trail (%d + %d > %d)", off, len,
                    m->m_epg_trllen));
                m_new = m_get(M_NOWAIT, MT_DATA);
                if (m_new == NULL)
                        goto fail;
                if (top == NULL)
                        top = m_new;
                else
                        prev->m_next = m_new;
                m_new->m_len = len;
                memcpy(mtod(m_new, void *), &m->m_epg_trail[off], len);
        }

        if (ref_inc != 0) {
                /*
                 * Obtain an additional reference on the old mbuf for
                 * each created EXT_SFBUF mbuf.  They will be dropped
                 * in mb_unmapped_free_mext().
                 */
                if (*refcnt == 1)
                        *refcnt += ref_inc;
                else
                        atomic_add_int(refcnt, ref_inc);
        }
        m_free(m);
        return (top);

fail:
        if (ref_inc != 0) {
                /*
                 * Obtain an additional reference on the old mbuf for
                 * each created EXT_SFBUF mbuf.  They will be
                 * immediately dropped when these mbufs are freed
                 * below.
                 */
                if (*refcnt == 1)
                        *refcnt += ref_inc;
                else
                        atomic_add_int(refcnt, ref_inc);
        }
        m_free(m);
        m_freem(top);
        return (NULL);
}

struct mbuf *
mb_unmapped_to_ext(struct mbuf *top)
{
        struct mbuf *m, *next, *prev = NULL;

        for (m = top; m != NULL; m = next) {
                /* m might be freed, so cache the next pointer. */
                next = m->m_next;
                if (m->m_flags & M_EXTPG) {
                        if (prev != NULL) {
                                /*
                                 * Remove 'm' from the new chain so
                                 * that the 'top' chain terminates
                                 * before 'm' in case 'top' is freed
                                 * due to an error.
                                 */
                                prev->m_next = NULL;
                        }
                        m = _mb_unmapped_to_ext(m);
                        if (m == NULL) {
                                m_freem(top);
                                m_freem(next);
                                return (NULL);
                        }
                        if (prev == NULL) {
                                top = m;
                        } else {
                                prev->m_next = m;
                        }

                        /*
                         * Replaced one mbuf with a chain, so we must
                         * find the end of chain.
                         */
                        prev = m_last(m);
                } else {
                        if (prev != NULL) {
                                prev->m_next = m;
                        }
                        prev = m;
                }
        }
        return (top);
}

/*
 * Allocate an empty M_EXTPG mbuf.  The ext_free routine is
 * responsible for freeing any pages backing this mbuf when it is
 * freed.
 */
struct mbuf *
mb_alloc_ext_pgs(int how, m_ext_free_t ext_free)
{
        struct mbuf *m;

        m = m_get(how, MT_DATA);
        if (m == NULL)
                return (NULL);

        m->m_epg_npgs = 0;
        m->m_epg_nrdy = 0;
        m->m_epg_1st_off = 0;
        m->m_epg_last_len = 0;
        m->m_epg_flags = 0;
        m->m_epg_hdrlen = 0;
        m->m_epg_trllen = 0;
        m->m_epg_tls = NULL;
        m->m_epg_so = NULL;
        m->m_data = NULL;
        m->m_flags |= (M_EXT | M_RDONLY | M_EXTPG);
        m->m_ext.ext_flags = EXT_FLAG_EMBREF;
        m->m_ext.ext_count = 1;
        m->m_ext.ext_size = 0;
        m->m_ext.ext_free = ext_free;
        return (m);
}
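
/*
 * Illustrative sketch (hypothetical callback): a consumer of
 * mb_alloc_ext_pgs() supplies an m_ext_free_t that releases whatever
 * backs m_epg_pa[]; mb_free_mext_pgs, used by mb_alloc_ext_plus_pages()
 * below, plays that role for anonymous wired pages.
 */
#if 0
static void
example_ext_free(struct mbuf *m)
{
        int i;

        /* Release each physical page recorded in the mbuf. */
        for (i = 0; i < m->m_epg_npgs; i++) {
                /* ... free/unwire the page at m->m_epg_pa[i] ... */
        }
}
#endif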

/*
 * Clean up after mbufs with M_EXT storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_ext(struct mbuf *m)
{
        volatile u_int *refcnt;
        struct mbuf *mref;
        int freembuf;

        KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));

        /* See if this is the mbuf that holds the embedded refcount. */
        if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
                refcnt = &m->m_ext.ext_count;
                mref = m;
        } else {
                KASSERT(m->m_ext.ext_cnt != NULL,
                    ("%s: no refcounting pointer on %p", __func__, m));
                refcnt = m->m_ext.ext_cnt;
                mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
        }

        /*
         * Check if the header is embedded in the cluster.  It is
         * important that we can't touch any of the mbuf fields
         * after we have freed the external storage, since mbuf
         * could have been embedded in it.  For now, the mbufs
         * embedded into the cluster are always of type EXT_EXTREF,
         * and for this type we won't free the mref.
         */
        if (m->m_flags & M_NOFREE) {
                freembuf = 0;
                KASSERT(m->m_ext.ext_type == EXT_EXTREF ||
                    m->m_ext.ext_type == EXT_RXRING,
                    ("%s: no-free mbuf %p has wrong type", __func__, m));
        } else
                freembuf = 1;

        /* Free attached storage if this mbuf is the only reference to it. */
        if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
                switch (m->m_ext.ext_type) {
                case EXT_PACKET:
                        /* The packet zone is special. */
                        if (*refcnt == 0)
                                *refcnt = 1;
                        uma_zfree(zone_pack, mref);
                        break;
                case EXT_CLUSTER:
                        uma_zfree(zone_clust, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_JUMBOP:
                        uma_zfree(zone_jumbop, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_JUMBO9:
                        uma_zfree(zone_jumbo9, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_JUMBO16:
                        uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_SFBUF:
                case EXT_NET_DRV:
                case EXT_MOD_TYPE:
                case EXT_DISPOSABLE:
                        KASSERT(mref->m_ext.ext_free != NULL,
                            ("%s: ext_free not set", __func__));
                        mref->m_ext.ext_free(mref);
                        uma_zfree(zone_mbuf, mref);
                        break;
                case EXT_EXTREF:
                        KASSERT(m->m_ext.ext_free != NULL,
                            ("%s: ext_free not set", __func__));
                        m->m_ext.ext_free(m);
                        break;
                case EXT_RXRING:
                        KASSERT(m->m_ext.ext_free == NULL,
                            ("%s: ext_free is set", __func__));
                        break;
                default:
                        KASSERT(m->m_ext.ext_type == 0,
                            ("%s: unknown ext_type", __func__));
                }
        }

        if (freembuf && m != mref)
                uma_zfree(zone_mbuf, m);
}

/*
 * Clean up after mbufs with M_EXTPG storage attached to them if the
 * reference count hits 1.
 */
void
mb_free_extpg(struct mbuf *m)
{
        volatile u_int *refcnt;
        struct mbuf *mref;

        M_ASSERTEXTPG(m);

        /* See if this is the mbuf that holds the embedded refcount. */
        if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
                refcnt = &m->m_ext.ext_count;
                mref = m;
        } else {
                KASSERT(m->m_ext.ext_cnt != NULL,
                    ("%s: no refcounting pointer on %p", __func__, m));
                refcnt = m->m_ext.ext_cnt;
                mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
        }

        /* Free attached storage if this mbuf is the only reference to it. */
        if (*refcnt == 1 || atomic_fetchadd_int(refcnt, -1) == 1) {
                KASSERT(mref->m_ext.ext_free != NULL,
                    ("%s: ext_free not set", __func__));

                mref->m_ext.ext_free(mref);
#ifdef KERN_TLS
                if (mref->m_epg_tls != NULL &&
                    !refcount_release_if_not_last(&mref->m_epg_tls->refcount))
                        ktls_enqueue_to_free(mref);
                else
#endif
                        uma_zfree(zone_mbuf, mref);
        }

        if (m != mref)
                uma_zfree(zone_mbuf, m);
}

/*
 * Official mbuf(9) allocation KPI for stack and drivers:
 *
 * m_get()      - a single mbuf without any attachments, sys/mbuf.h.
 * m_gethdr()   - a single mbuf initialized as M_PKTHDR, sys/mbuf.h.
 * m_getcl()    - an mbuf + 2k cluster, sys/mbuf.h.
 * m_clget()    - attach cluster to already allocated mbuf.
 * m_cljget()   - attach jumbo cluster to already allocated mbuf.
 * m_get2()     - allocate minimum mbuf that would fit size argument.
 * m_getm2()    - allocate a chain of mbufs/clusters.
 * m_extadd()   - attach external cluster to mbuf.
 *
 * m_free()     - free single mbuf with its tags and ext, sys/mbuf.h.
 * m_freem()    - free chain of mbufs.
 */
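
/*
 * Illustrative sketch (not part of this file): typical use of the KPI
 * above to build and dispose of a packet.  Names and sizes are invented
 * for the example.
 */
#if 0
static struct mbuf *
example_build_packet(const char *payload, int len)
{
        struct mbuf *m;

        /* One mbuf chain sized to 'len', with a packet header in front. */
        m = m_getm2(NULL, len, M_NOWAIT, MT_DATA, M_PKTHDR);
        if (m == NULL)
                return (NULL);
        m_copyback(m, 0, len, payload);
        m->m_pkthdr.len = len;
        return (m);             /* Dispose of with m_freem(). */
}
#endif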

int
m_clget(struct mbuf *m, int how)
{

        KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
            __func__, m));
        m->m_ext.ext_buf = (char *)NULL;
        uma_zalloc_arg(zone_clust, m, how);
        /*
         * On a cluster allocation failure, drain the packet zone and retry,
         * we might be able to loosen a few clusters up on the drain.
         */
        if ((how & M_NOWAIT) && (m->m_ext.ext_buf == NULL)) {
                uma_zone_reclaim(zone_pack, UMA_RECLAIM_DRAIN);
                uma_zalloc_arg(zone_clust, m, how);
        }
        MBUF_PROBE2(m__clget, m, how);
        return (m->m_flags & M_EXT);
}

/*
 * m_cljget() is different from m_clget() as it can allocate clusters without
 * attaching them to an mbuf.  In that case the return value is the pointer
 * to the cluster of the requested size.  If an mbuf was specified, it gets
 * the cluster attached to it and the return value can be safely ignored.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
void *
m_cljget(struct mbuf *m, int how, int size)
{
        uma_zone_t zone;
        void *retval;

        if (m != NULL) {
                KASSERT((m->m_flags & M_EXT) == 0, ("%s: mbuf %p has M_EXT",
                    __func__, m));
                m->m_ext.ext_buf = NULL;
        }

        zone = m_getzone(size);
        retval = uma_zalloc_arg(zone, m, how);

        MBUF_PROBE4(m__cljget, m, how, size, retval);

        return (retval);
}
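
/*
 * Illustrative sketch (hypothetical caller): attaching a 9k jumbo
 * cluster to an existing mbuf, which must not already carry M_EXT.
 */
#if 0
static int
example_attach_jumbo(struct mbuf *m)
{
        if (m_cljget(m, M_NOWAIT, MJUM9BYTES) == NULL)
                return (ENOBUFS);
        return (0);
}
#endif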

/*
 * m_get2() allocates minimum mbuf that would fit "size" argument.
 */
struct mbuf *
m_get2(int size, int how, short type, int flags)
{
        struct mb_args args;
        struct mbuf *m, *n;

        args.flags = flags;
        args.type = type;

        if (size <= MHLEN || (size <= MLEN && (flags & M_PKTHDR) == 0))
                return (uma_zalloc_arg(zone_mbuf, &args, how));
        if (size <= MCLBYTES)
                return (uma_zalloc_arg(zone_pack, &args, how));

        if (size > MJUMPAGESIZE)
                return (NULL);

        m = uma_zalloc_arg(zone_mbuf, &args, how);
        if (m == NULL)
                return (NULL);

        n = uma_zalloc_arg(zone_jumbop, m, how);
        if (n == NULL) {
                uma_zfree(zone_mbuf, m);
                return (NULL);
        }

        return (m);
}

/*
 * m_getjcl() returns an mbuf with a cluster of the specified size attached.
 * For size it takes MCLBYTES, MJUMPAGESIZE, MJUM9BYTES, MJUM16BYTES.
 */
struct mbuf *
m_getjcl(int how, short type, int flags, int size)
{
        struct mb_args args;
        struct mbuf *m, *n;
        uma_zone_t zone;

        if (size == MCLBYTES)
                return (m_getcl(how, type, flags));

        args.flags = flags;
        args.type = type;

        m = uma_zalloc_arg(zone_mbuf, &args, how);
        if (m == NULL)
                return (NULL);

        zone = m_getzone(size);
        n = uma_zalloc_arg(zone, m, how);
        if (n == NULL) {
                uma_zfree(zone_mbuf, m);
                return (NULL);
        }
        MBUF_PROBE5(m__getjcl, how, type, flags, size, m);
        return (m);
}

/*
 * Allocate a given length worth of mbufs and/or clusters (whatever fits
 * best) and return a pointer to the top of the allocated chain.  If an
 * existing mbuf chain is provided, then we will append the new chain
 * to the existing one and return a pointer to the provided mbuf.
 */
struct mbuf *
m_getm2(struct mbuf *m, int len, int how, short type, int flags)
{
        struct mbuf *mb, *nm = NULL, *mtail = NULL;

        KASSERT(len >= 0, ("%s: len is < 0", __func__));

        /* Validate flags. */
        flags &= (M_PKTHDR | M_EOR);

        /* Packet header mbuf must be first in chain. */
        if ((flags & M_PKTHDR) && m != NULL)
                flags &= ~M_PKTHDR;

        /* Loop and append maximum sized mbufs to the chain tail. */
        while (len > 0) {
                mb = NULL;
                if (len > MCLBYTES) {
                        mb = m_getjcl(M_NOWAIT, type, (flags & M_PKTHDR),
                            MJUMPAGESIZE);
                }
                if (mb == NULL) {
                        if (len >= MINCLSIZE)
                                mb = m_getcl(how, type, (flags & M_PKTHDR));
                        else if (flags & M_PKTHDR)
                                mb = m_gethdr(how, type);
                        else
                                mb = m_get(how, type);

                        /*
                         * Fail the whole operation if one mbuf can't be
                         * allocated.
                         */
                        if (mb == NULL) {
                                m_freem(nm);
                                return (NULL);
                        }
                }

                /* Book keeping. */
                len -= M_SIZE(mb);
                if (mtail != NULL)
                        mtail->m_next = mb;
                else
                        nm = mb;
                mtail = mb;
                flags &= ~M_PKTHDR;     /* Only valid on the first mbuf. */
        }
        if (flags & M_EOR)
                mtail->m_flags |= M_EOR;        /* Only valid on the last mbuf. */

        /* If mbuf was supplied, append new chain to the end of it. */
        if (m != NULL) {
                for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
                        ;
                mtail->m_next = nm;
                mtail->m_flags &= ~M_EOR;
        } else
                m = nm;

        return (m);
}

/*-
 * Configure a provided mbuf to refer to the provided external storage
 * buffer and setup a reference count for said buffer.
 *
 * Arguments:
 *    mb     The existing mbuf to which to attach the provided buffer.
 *    buf    The address of the provided external storage buffer.
 *    size   The size of the provided buffer.
 *    freef  A pointer to a routine that is responsible for freeing the
 *           provided external storage buffer.
 *    args   A pointer to an argument structure (of any type) to be passed
 *           to the provided freef routine (may be NULL).
 *    flags  Any other flags to be passed to the provided mbuf.
 *    type   The type that the external storage buffer should be
 *           labeled with.
 *
 * Returns:
 *    Nothing.
 */
void
m_extadd(struct mbuf *mb, char *buf, u_int size, m_ext_free_t freef,
    void *arg1, void *arg2, int flags, int type)
{

        KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));

        mb->m_flags |= (M_EXT | flags);
        mb->m_ext.ext_buf = buf;
        mb->m_data = mb->m_ext.ext_buf;
        mb->m_ext.ext_size = size;
        mb->m_ext.ext_free = freef;
        mb->m_ext.ext_arg1 = arg1;
        mb->m_ext.ext_arg2 = arg2;
        mb->m_ext.ext_type = type;

        if (type != EXT_EXTREF) {
                mb->m_ext.ext_count = 1;
                mb->m_ext.ext_flags = EXT_FLAG_EMBREF;
        } else
                mb->m_ext.ext_flags = 0;
}
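
/*
 * Illustrative sketch (hypothetical driver buffer): attaching external
 * storage with a custom free routine via m_extadd().
 */
#if 0
static void
example_buf_free(struct mbuf *m)
{
        /* m->m_ext.ext_arg1 points at the driver buffer; release it. */
}

static void
example_attach_buf(struct mbuf *m, char *buf, u_int size)
{
        m_extadd(m, buf, size, example_buf_free, buf, NULL, 0, EXT_NET_DRV);
}
#endif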

/*
 * Free an entire chain of mbufs and associated external buffers, if
 * applicable.
 */
void
m_freem(struct mbuf *mb)
{

        MBUF_PROBE1(m__freem, mb);
        while (mb != NULL)
                mb = m_free(mb);
}

/*
 * Temporary primitive to allow freeing without going through m_free.
 */
void
m_free_raw(struct mbuf *mb)
{

        uma_zfree(zone_mbuf, mb);
}

int
m_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
    struct m_snd_tag **mstp)
{

        if (ifp->if_snd_tag_alloc == NULL)
                return (EOPNOTSUPP);
        return (ifp->if_snd_tag_alloc(ifp, params, mstp));
}

void
m_snd_tag_init(struct m_snd_tag *mst, struct ifnet *ifp, u_int type)
{

        if_ref(ifp);
        mst->ifp = ifp;
        refcount_init(&mst->refcount, 1);
        mst->type = type;
        counter_u64_add(snd_tag_count, 1);
}

void
m_snd_tag_destroy(struct m_snd_tag *mst)
{
        struct ifnet *ifp;

        ifp = mst->ifp;
        ifp->if_snd_tag_free(mst);
        if_rele(ifp);
        counter_u64_add(snd_tag_count, -1);
}

/*
 * Allocate an mbuf with anonymous external pages.
 */
struct mbuf *
mb_alloc_ext_plus_pages(int len, int how)
{
        struct mbuf *m;
        vm_page_t pg;
        int i, npgs;

        m = mb_alloc_ext_pgs(how, mb_free_mext_pgs);
        if (m == NULL)
                return (NULL);
        m->m_epg_flags |= EPG_FLAG_ANON;
        npgs = howmany(len, PAGE_SIZE);
        for (i = 0; i < npgs; i++) {
                do {
                        pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP |
                            VM_ALLOC_WIRED);
                        if (pg == NULL) {
                                if (how == M_NOWAIT) {
                                        m->m_epg_npgs = i;
                                        m_free(m);
                                        return (NULL);
                                }
                                vm_wait(NULL);
                        }
                } while (pg == NULL);
                m->m_epg_pa[i] = VM_PAGE_TO_PHYS(pg);
        }
        m->m_epg_npgs = npgs;
        return (m);
}

/*
 * Copy the data in the mbuf chain to a chain of mbufs with anonymous external
 * unmapped pages.
 * len is the length of data in the input mbuf chain.
 * mlen is the maximum number of bytes put into each ext_page mbuf.
 */
struct mbuf *
mb_mapped_to_unmapped(struct mbuf *mp, int len, int mlen, int how,
    struct mbuf **mlast)
{
        struct mbuf *m, *mout;
        char *pgpos, *mbpos;
        int i, mblen, mbufsiz, pglen, xfer;

        if (len == 0)
                return (NULL);
        mbufsiz = min(mlen, len);
        m = mout = mb_alloc_ext_plus_pages(mbufsiz, how);
        if (m == NULL)
                return (m);
        pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[0]);
        pglen = PAGE_SIZE;
        mblen = 0;
        i = 0;
        do {
                if (pglen == 0) {
                        if (++i == m->m_epg_npgs) {
                                m->m_epg_last_len = PAGE_SIZE;
                                mbufsiz = min(mlen, len);
                                m->m_next = mb_alloc_ext_plus_pages(mbufsiz,
                                    how);
                                m = m->m_next;
                                if (m == NULL) {
                                        m_freem(mout);
                                        return (m);
                                }
                                i = 0;
                        }
                        pgpos = (char *)(void *)PHYS_TO_DMAP(m->m_epg_pa[i]);
                        pglen = PAGE_SIZE;
                }
                while (mblen == 0) {
                        if (mp == NULL) {
                                m_freem(mout);
                                return (NULL);
                        }
                        KASSERT((mp->m_flags & M_EXTPG) == 0,
                            ("%s: ext_pgs input mbuf", __func__));
                        mbpos = mtod(mp, char *);
                        mblen = mp->m_len;
                        mp = mp->m_next;
                }
                xfer = min(mblen, pglen);
                memcpy(pgpos, mbpos, xfer);
                pgpos += xfer;
                mbpos += xfer;
                pglen -= xfer;
                mblen -= xfer;
                len -= xfer;
                m->m_len += xfer;
        } while (len > 0);
        m->m_epg_last_len = PAGE_SIZE - pglen;
        if (mlast != NULL)
                *mlast = m;
        return (mout);
}