FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_mbuf.c
1 /*-
2 * Copyright (c) 2001, 2002, 2003
3 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD: releng/5.1/sys/kern/subr_mbuf.c 115789 2003-06-03 23:27:05Z bmilekic $
29 */
30
31 #include "opt_mac.h"
32 #include "opt_param.h"
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mac.h>
38 #include <sys/mbuf.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/condvar.h>
42 #include <sys/smp.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/domain.h>
46 #include <sys/protosw.h>
47
48 #include <vm/vm.h>
49 #include <vm/vm_kern.h>
50 #include <vm/vm_extern.h>
51 #include <vm/pmap.h>
52 #include <vm/vm_map.h>
53
54 /*
55 * mb_alloc: network buffer allocator
56 *
57 * XXX: currently, the "low watermark" sysctl is marked read-only as its
58 * effects are not completely implemented. To be fixed soon.
59 */
60
61 /*
62 * Maximum number of PCPU containers. If you know what you're doing you could
63 * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
64 * system during compilation, and thus prevent kernel structure bloat.
65 *
66 * SMP and non-SMP kernels clearly have a different number of possible CPUs,
67 * but because we cannot assume a dense array of CPUs, we always allocate
68 * and traverse PCPU containers up to NCPU amount and merely check for
69 * CPU availability.
70 */
71 #ifdef MBALLOC_NCPU
72 #define NCPU MBALLOC_NCPU
73 #else
74 #define NCPU MAXCPU
75 #endif
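
/*
 * For example, a dedicated uniprocessor kernel could be built with a
 * single PCPU container (a sketch; this assumes MBALLOC_NCPU is wired
 * into the kernel option headers pulled in via opt_param.h, otherwise
 * adding -DMBALLOC_NCPU=1 to the kernel CFLAGS has the same effect):
 *
 *        options         MBALLOC_NCPU=1
 */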
76
77 /*-
78 * The mbuf allocator is based on Alfred Perlstein's <alfred@FreeBSD.org>
79 * "memcache" proof-of-concept allocator which was itself based on
80 * several well-known SMP-friendly allocators.
81 *
82 * The mb_alloc mbuf allocator is special when compared to other
83 * general-purpose allocators. Some things to take note of:
84 *
85 * Mbufs and mbuf clusters are two different objects. Sometimes we
86 * will allocate a single mbuf, other times a single cluster,
87 * other times both. Further, we may sometimes wish to allocate a
88 * whole chain of mbufs with clusters. This allocator will perform
89 * the common case of each scenario in one function call (this
90 * includes constructing or destructing the object) while only
91 * locking/unlocking the cache once, if it can get away with it.
92 * The caches consist of pure mbufs and pure clusters; that is,
93 * there are no 'zones' containing mbufs with already pre-hooked
94 * clusters. Since we can allocate both objects atomically anyway,
95 * we don't bother fragmenting our caches for any particular 'scenarios.'
96 *
97 * We allocate from separate sub-maps of kmem_map, thus imposing
98 * an ultimate upper-limit on the number of allocatable clusters
99 * and mbufs and also, since the clusters all come from a
100 * virtually contiguous region, we can keep reference counters
101 * for them and "allocate" them purely by indexing into a
102 * dense refcount vector.
103 *
104 * We call out to protocol drain routines (which can be hooked
105 * into us) when we're low on space.
106 *
107 * The mbuf allocator keeps all objects that it allocates in mb_buckets.
108 * The buckets keep a number of objects (an object can be an mbuf or an
109 * mbuf cluster) and facilitate moving larger sets of contiguous objects
110 * from the per-CPU caches to the global cache. The buckets also have
111 * the added advantage that objects, when migrated from cache to cache,
112 * are migrated in chunks that keep contiguous objects together,
113 * minimizing TLB pollution.
114 *
115 * The buckets are kept on singly-linked lists called "containers." A container
116 * is protected by a mutex in order to ensure consistency. The mutex
117 * itself is allocated separately and attached to the container at boot time,
118 * thus allowing for certain containers to share the same lock. Per-CPU
119 * containers for mbufs and mbuf clusters all share the same per-CPU
120 * lock whereas the global cache containers for these objects share one
121 * global lock.
122 */
123 struct mb_bucket {
124 SLIST_ENTRY(mb_bucket) mb_blist;	/* container list linkage */
125 int mb_owner;			/* owning container id (+ free flag) */
126 int mb_numfree;			/* number of free objects remaining */
127 void *mb_free[0];		/* variable-length free object table */
128 };
129
130 struct mb_container {
131 SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;	/* bucket list head */
132 struct mtx *mc_lock;		/* protecting lock (may be shared) */
133 int mc_numowner;		/* owner id (CPU # or general list) */
134 u_int mc_starved;		/* starvation/bucket-transfer count */
135 long *mc_types;			/* per-mbuf-type counters */
136 u_long *mc_objcount;		/* number of free objects held */
137 u_long *mc_numbucks;		/* number of buckets owned */
138 };
139
140 struct mb_gen_list {
141 struct mb_container mb_cont;
142 struct cv mgl_mstarved;		/* slept on when fully starved */
143 };
144
145 struct mb_pcpu_list {
146 struct mb_container mb_cont;
147 };
148
149 /*
150 * Boot-time configurable object counts that will determine the maximum
151 * number of permitted objects in the mbuf and mcluster cases. In the
152 * ext counter (nmbcnt) case, it's just an indicator serving to scale
153 * kmem_map size properly - in other words, we may be allowed to allocate
154 * more than nmbcnt counters, whereas we will never be allowed to allocate
155 * more than nmbufs mbufs or nmbclusters mclusters.
156 * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
157 * allocatable by the sfbuf allocator (found in uipc_syscalls.c).
158 */
159 #ifndef NMBCLUSTERS
160 #define NMBCLUSTERS (1024 + maxusers * 64)
161 #endif
162 #ifndef NMBUFS
163 #define NMBUFS (nmbclusters * 2)
164 #endif
165 #ifndef NSFBUFS
166 #define NSFBUFS (512 + maxusers * 16)
167 #endif
168 #ifndef NMBCNTS
169 #define NMBCNTS (nmbclusters + nsfbufs)
170 #endif
171 int nmbufs;
172 int nmbclusters;
173 int nmbcnt;
174 int nsfbufs;
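
/*
 * Worked example of the defaults above, assuming maxusers = 128:
 *
 *        nmbclusters = 1024 + 128 * 64 = 9216
 *        nmbufs      = 9216 * 2        = 18432
 *        nsfbufs     = 512 + 128 * 16  = 2560
 *        nmbcnt      = 9216 + 2560     = 11776
 */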
175
176 /*
177 * Bucket sizes, in bytes. Each bucket holds this size's worth of mbufs
178 * or clusters. Please keep these powers of 2.
179 */
180 #define MBUF_BUCK_SZ (PAGE_SIZE * 2)
181 #define CLUST_BUCK_SZ (PAGE_SIZE * 4)
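
/*
 * For example, with 4 KB pages and the common MSIZE = 256 and
 * MCLBYTES = 2048 (both are platform-dependent):
 *
 *        MBUF_BUCK_SZ  = 8192  -> 8192 / 256   = 32 mbufs per bucket
 *        CLUST_BUCK_SZ = 16384 -> 16384 / 2048 = 8 clusters per bucket
 */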
182
183 /*
184 * Perform sanity checks of tunables declared above.
185 */
186 static void
187 tunable_mbinit(void *dummy)
188 {
189
190 /*
191 * This has to be done before VM init.
192 */
193 nmbclusters = NMBCLUSTERS;
194 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
195 nmbufs = NMBUFS;
196 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
197 nsfbufs = NSFBUFS;
198 TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
199 nmbcnt = NMBCNTS;
200 TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
201 /* Sanity checks */
202 if (nmbufs < nmbclusters * 2)
203 nmbufs = nmbclusters * 2;
204 if (nmbcnt < nmbclusters + nsfbufs)
205 nmbcnt = nmbclusters + nsfbufs;
206 }
207 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
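
/*
 * The TUNABLE_INT_FETCH() calls above read loader(8) tunables, so these
 * limits can be raised at boot time from /boot/loader.conf, e.g.:
 *
 *        kern.ipc.nmbclusters="32768"
 *        kern.ipc.nmbufs="65536"
 */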
208
209 /*
210 * The freelist structures and mutex locks. The number statically declared
211 * here depends on the number of CPUs.
212 *
213 * We set up in such a way that all the objects (mbufs, clusters)
214 * share the same mutex lock. It has been established that we do not benefit
215 * from different locks for different objects, so we use the same lock,
216 * regardless of object type. This also allows us to do optimised
217 * multi-object allocations without dropping the lock in between.
218 */
219 struct mb_lstmngr {
220 struct mb_gen_list *ml_genlist;		/* global (general) cache */
221 struct mb_pcpu_list *ml_cntlst[NCPU];	/* per-CPU caches */
222 struct mb_bucket **ml_btable;	/* address-to-bucket lookup table */
223 vm_map_t ml_map;		/* submap the objects come from */
224 vm_offset_t ml_mapbase;		/* start of allocatable region */
225 vm_offset_t ml_maptop;		/* end of allocatable region */
226 int ml_mapfull;			/* nonzero if address space exhausted */
227 u_int ml_objsize;		/* size of one object, in bytes */
228 u_int ml_objbucks;		/* number of objects per bucket */
229 u_int *ml_wmhigh;		/* high watermark per cache */
230 u_int *ml_wmlow;		/* low watermark per cache */
231 };
232 static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
233 static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
234 static u_int *cl_refcntmap;
235
236 /*
237 * Local macros for internal allocator structure manipulations.
238 */
239 #ifdef SMP
240 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
241 #else
242 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
243 #endif
244
245 #define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
246
247 #define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
248
249 #define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
250
251 #define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
252 (mb_lst)->ml_cntlst[(num)]
253
254 #define MB_BUCKET_INDX(mb_obj, mb_lst) \
255 (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \
256 ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize))
257
258 #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
259 { \
260 struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
261 \
262 (mb_bckt)->mb_numfree--; \
263 (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
264 (*((mb_lst)->mb_cont.mc_objcount))--; \
265 if ((mb_bckt)->mb_numfree == 0) { \
266 SLIST_REMOVE_HEAD(_mchd, mb_blist); \
267 SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
268 (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
269 } \
270 }
271
272 #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
273 (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
274 (mb_bckt)->mb_numfree++; \
275 (*((mb_lst)->mb_cont.mc_objcount))++;
276
277 #define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
278 if ((mb_type) != MT_NOTMBUF) \
279 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
280
281 #define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
282 if ((mb_type) != MT_NOTMBUF) \
283 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
284
285 /*
286 * Ownership of buckets/containers is represented by integers. The PCPU
287 * lists range from 0 to NCPU-1. We need a free numerical id for the general
288 * list (we use NCPU). We also need a non-conflicting free bit to indicate
289 * that the bucket is free and removed from a container, while not losing
290 * the bucket's originating container id. We use the highest bit
291 * for the free marker.
292 */
293 #define MB_GENLIST_OWNER (NCPU)
294 #define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
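
/*
 * Encoding example: a bucket owned by per-CPU container 1 that has been
 * emptied and detached from its list carries
 * mb_owner == (1 | MB_BUCKET_FREE); the originating container id is
 * recovered by masking off the free bit, as the alloc/free paths below
 * do:
 *
 *        owner = bucket->mb_owner & ~MB_BUCKET_FREE;	(yields 1)
 */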
295
296 /* Statistics structures for allocator (per-CPU and general). */
297 static struct mbpstat mb_statpcpu[NCPU + 1];
298 struct mbstat mbstat;
299
300 /* Sleep time for wait code (in ticks). */
301 static int mbuf_wait = 64;
302
303 static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */
304 static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */
305 static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */
306 static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */
307
308 /*
309 * Objects exported by sysctl(8).
310 */
311 SYSCTL_DECL(_kern_ipc);
312 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
313 "Maximum number of mbuf clusters available");
314 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
315 "Maximum number of mbufs available");
316 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
317 "Number used to scale kmem_map to ensure sufficient space for counters");
318 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
319 "Maximum number of sendfile(2) sf_bufs available");
320 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
321 "Sleep time of mbuf subsystem wait allocations during exhaustion");
322 SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0,
323 "Upper limit of number of mbufs allowed in each cache");
324 SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RD, &mbuf_lowm, 0,
325 "Lower limit of number of mbufs allowed in each cache");
326 SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0,
327 "Upper limit of number of mbuf clusters allowed in each cache");
328 SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RD, &clust_lowm, 0,
329 "Lower limit of number of mbuf clusters allowed in each cache");
330 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
331 "Mbuf general information and statistics");
332 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
333 sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
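
/*
 * All of the above are visible from userland; for example (netstat(1)
 * formats the same mbstat/mb_statpcpu data):
 *
 *        $ sysctl kern.ipc.nmbclusters
 *        $ netstat -m
 */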
334
335 /*
336 * Prototypes of local allocator routines.
337 */
338 static void *mb_alloc_wait(struct mb_lstmngr *, short);
339 static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
340 struct mb_pcpu_list *);
341 static void mb_reclaim(void);
342 static void mbuf_init(void *);
343
344 /*
345 * Initial allocation numbers. Each parameter represents the number of buckets
346 * of each object that will be placed initially in each PCPU container for
347 * said object.
348 */
349 #define NMB_MBUF_INIT 2
350 #define NMB_CLUST_INIT 8
351
352 /*
353 * Internal flags that allow for cache locks to remain "persistent" across
354 * allocation and free calls. They may be used in combination.
355 */
356 #define MBP_PERSIST 0x1 /* Return with lock still held. */
357 #define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */
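
/*
 * Sketch of the resulting two-step protocol (m_getcl() below is the
 * real thing): the first allocation passes MBP_PERSIST and returns with
 * the chosen per-CPU cache lock still held, recording the cache number
 * in cchnum; the second passes MBP_PERSISTENT to indicate that the lock
 * for cache #cchnum is already held, and drops it when done:
 *
 *        int cchnum;
 *
 *        m  = mb_alloc(&mb_list_mbuf, how, type, MBP_PERSIST, &cchnum);
 *        cl = mb_alloc(&mb_list_clust, how, MT_NOTMBUF, MBP_PERSISTENT,
 *            &cchnum);
 */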
358
359 /*
360 * Initialize the mbuf subsystem.
361 *
362 * We sub-divide the kmem_map into several submaps; this way, we don't have
363 * to worry about artificially limiting the number of mbuf or mbuf cluster
364 * allocations, due to fear of one type of allocation "stealing" address
365 * space initially reserved for another.
366 *
367 * Set up both the general containers and all the PCPU containers. Populate
368 * the PCPU containers with initial numbers.
369 */
370 MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
371 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
372 static void
373 mbuf_init(void *dummy)
374 {
375 struct mb_pcpu_list *pcpu_cnt;
376 vm_size_t mb_map_size;
377 int i, j;
378
379 /*
380 * Set up all the submaps, for each type of object that we deal
381 * with in this allocator.
382 */
383 mb_map_size = (vm_size_t)(nmbufs * MSIZE);
384 mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ);
385 mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size /
386 MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
387 if (mb_list_mbuf.ml_btable == NULL)
388 goto bad;
389 mb_list_mbuf.ml_map = kmem_suballoc(kmem_map, &(mb_list_mbuf.ml_mapbase),
390 &(mb_list_mbuf.ml_maptop), mb_map_size);
391 mb_list_mbuf.ml_map->system_map = 1;
392 mb_list_mbuf.ml_mapfull = 0;
393 mb_list_mbuf.ml_objsize = MSIZE;
394 mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / MSIZE;
395 mb_list_mbuf.ml_wmhigh = &mbuf_hiwm;
396 mb_list_mbuf.ml_wmlow = &mbuf_lowm;
397
398 mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
399 mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ);
400 mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size /
401 CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
402 if (mb_list_clust.ml_btable == NULL)
403 goto bad;
404 mb_list_clust.ml_map = kmem_suballoc(kmem_map,
405 &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
406 mb_map_size);
407 mb_list_clust.ml_map->system_map = 1;
408 mb_list_clust.ml_mapfull = 0;
409 mb_list_clust.ml_objsize = MCLBYTES;
410 mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / MCLBYTES;
411 mb_list_clust.ml_wmhigh = &clust_hiwm;
412 mb_list_clust.ml_wmlow = &clust_lowm;
413
414 /*
415 * Allocate required general (global) containers for each object type.
416 */
417 mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
418 M_NOWAIT);
419 mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
420 M_NOWAIT);
421 if ((mb_list_mbuf.ml_genlist == NULL) ||
422 (mb_list_clust.ml_genlist == NULL))
423 goto bad;
424
425 /*
426 * Initialize condition variables and general container mutex locks.
427 */
428 mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0);
429 cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
430 cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
431 "mcluster pool starved");
432 mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
433 mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
434
435 /*
436 * Set up the general containers for each object.
437 */
438 mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
439 mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
440 mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
441 mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
442 mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
443 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
444 mb_list_clust.ml_genlist->mb_cont.mc_objcount =
445 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
446 mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks =
447 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks);
448 mb_list_clust.ml_genlist->mb_cont.mc_numbucks =
449 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks);
450 mb_list_mbuf.ml_genlist->mb_cont.mc_types =
451 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
452 mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
453 SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
454 SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
455
456 /*
457 * Allocate all the required counters for clusters. This makes
458 * cluster allocations/deallocations much faster.
459 */
460 cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT);
461 if (cl_refcntmap == NULL)
462 goto bad;
463
464 /*
465 * Initialize general mbuf statistics.
466 */
467 mbstat.m_msize = MSIZE;
468 mbstat.m_mclbytes = MCLBYTES;
469 mbstat.m_minclsize = MINCLSIZE;
470 mbstat.m_mlen = MLEN;
471 mbstat.m_mhlen = MHLEN;
472 mbstat.m_numtypes = MT_NTYPES;
473 mbstat.m_mbperbuck = MBUF_BUCK_SZ / MSIZE;
474 mbstat.m_clperbuck = CLUST_BUCK_SZ / MCLBYTES;
475
476 /*
477 * Allocate and initialize PCPU containers.
478 */
479 for (i = 0; i < NCPU; i++) {
480 if (CPU_ABSENT(i)) {
481 mb_statpcpu[i].mb_active = 0;
482 continue;
483 }
484
485 mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
486 M_MBUF, M_NOWAIT);
487 mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
488 M_MBUF, M_NOWAIT);
489 if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
490 (mb_list_clust.ml_cntlst[i] == NULL))
491 goto bad;
492
493 mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0);
494 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
495 mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
496
497 mb_statpcpu[i].mb_active = 1;
498 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
499 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
500 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
501 mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
502 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
503 &(mb_statpcpu[i].mb_mbfree);
504 mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
505 &(mb_statpcpu[i].mb_clfree);
506 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks =
507 &(mb_statpcpu[i].mb_mbbucks);
508 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks =
509 &(mb_statpcpu[i].mb_clbucks);
510 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
511 &(mb_statpcpu[i].mb_mbtypes[0]);
512 mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
513
514 SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
515 SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
516
517 /*
518 * Perform initial allocations.
519 */
520 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
521 MB_LOCK_CONT(pcpu_cnt);
522 for (j = 0; j < NMB_MBUF_INIT; j++) {
523 if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
524 == NULL)
525 goto bad;
526 }
527 MB_UNLOCK_CONT(pcpu_cnt);
528
529 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
530 MB_LOCK_CONT(pcpu_cnt);
531 for (j = 0; j < NMB_CLUST_INIT; j++) {
532 if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
533 == NULL)
534 goto bad;
535 }
536 MB_UNLOCK_CONT(pcpu_cnt);
537 }
538
539 return;
540 bad:
541 panic("mbuf_init(): failed to initialize mbuf subsystem!");
542 }
543
544 /*
545 * Populate a given mbuf PCPU container with a bucket full of fresh new
546 * buffers. Return a pointer to the new bucket (already in the container if
547 * successful), or return NULL on failure.
548 *
549 * LOCKING NOTES:
550 * PCPU container lock must be held when this is called.
551 * The lock is dropped here so that we can cleanly call the underlying VM
552 * code. If we fail, we return with no locks held. If we succeed (i.e., return
553 * non-NULL), we return with the PCPU lock held, ready for allocation from
554 * the returned bucket.
555 */
556 static struct mb_bucket *
557 mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
558 {
559 struct mb_bucket *bucket;
560 caddr_t p;
561 int i;
562
563 MB_UNLOCK_CONT(cnt_lst);
564 /*
565 * If our object's (finite) map is starved (i.e., no more address
566 * space), bail out now.
567 */
568 if (mb_list->ml_mapfull)
569 return (NULL);
570
571 bucket = malloc(sizeof(struct mb_bucket) +
572 mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how));
573 if (bucket == NULL)
574 return (NULL);
575
576 p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize *
577 mb_list->ml_objbucks, MBTOM(how));
578 if (p == NULL) {
579 free(bucket, M_MBUF);
580 if (how == M_TRYWAIT)
581 mb_list->ml_mapfull = 1;
582 return (NULL);
583 }
584
585 bucket->mb_numfree = 0;
586 mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
587 for (i = 0; i < mb_list->ml_objbucks; i++) {
588 bucket->mb_free[i] = p;
589 bucket->mb_numfree++;
590 p += mb_list->ml_objsize;
591 }
592
593 MB_LOCK_CONT(cnt_lst);
594 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
595 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
596 (*(cnt_lst->mb_cont.mc_numbucks))++;
597 *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
598
599 return (bucket);
600 }
601
602 /*
603 * Allocate a network buffer.
604 * The general case is very easy. Complications only arise if our PCPU
605 * container is empty. Things get worse if the PCPU container is empty,
606 * the general container is empty, and we've run out of address space
607 * in our map; then we try to block if we're willing to (M_TRYWAIT).
608 */
609 static __inline
610 void *
611 mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
612 int *pers_list)
613 {
614 static int last_report;
615 struct mb_pcpu_list *cnt_lst;
616 struct mb_bucket *bucket;
617 void *m;
618
619 #ifdef INVARIANTS
620 int flags;
621
622 flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
623 if (flags != M_DONTWAIT && flags != M_TRYWAIT) {
624 static struct timeval lasterr;
625 static int curerr;
626 if (ppsratecheck(&lasterr, &curerr, 1)) {
627 printf("Bad mbuf alloc flags: %x\n", flags);
628 backtrace();
629 how = M_TRYWAIT;
630 }
631 }
632 if ((flags & M_DONTWAIT) == 0)
633 GIANT_REQUIRED;
634 #endif
635
636 m = NULL;
637 if ((persist & MBP_PERSISTENT) != 0) {
638 /*
639 * If we're a "persistent" call, then the per-CPU #(pers_list)
640 * cache lock is already held, and we just need to refer to
641 * the correct cache descriptor.
642 */
643 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list);
644 } else {
645 cnt_lst = MB_GET_PCPU_LIST(mb_list);
646 MB_LOCK_CONT(cnt_lst);
647 }
648
649 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
650 /*
651 * This is the easy allocation case. We just grab an object
652 * from a bucket in the PCPU container. At worst, we
653 * have just emptied the bucket and so we remove it
654 * from the container.
655 */
656 MB_GET_OBJECT(m, bucket, cnt_lst);
657 MB_MBTYPES_INC(cnt_lst, type, 1);
658
659 /* If asked to persist, do not drop the lock. */
660 if ((persist & MBP_PERSIST) == 0)
661 MB_UNLOCK_CONT(cnt_lst);
662 else
663 *pers_list = cnt_lst->mb_cont.mc_numowner;
664 } else {
665 struct mb_gen_list *gen_list;
666
667 /*
668 * This is the less common, more difficult case. We must
669 * first verify if the general list has anything for us
670 * and if that also fails, we must allocate a page from
671 * the map and create a new bucket to place in our PCPU
672 * container (already locked). If the map is starved then
673 * we're really in for trouble, as we have to wait on
674 * the general container's condition variable.
675 */
676 gen_list = MB_GET_GEN_LIST(mb_list);
677 MB_LOCK_CONT(gen_list);
678
679 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
680 != NULL) {
681 /*
682 * Give ownership of the bucket to our CPU's
683 * container, but only actually put the bucket
684 * in the container if it doesn't become free
685 * upon removing an mbuf from it.
686 */
687 SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
688 mb_blist);
689 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
690 (*(gen_list->mb_cont.mc_numbucks))--;
691 (*(cnt_lst->mb_cont.mc_numbucks))++;
692 *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
693 bucket->mb_numfree--;
694 m = bucket->mb_free[(bucket->mb_numfree)];
695 if (bucket->mb_numfree == 0) {
696 SLIST_NEXT(bucket, mb_blist) = NULL;
697 bucket->mb_owner |= MB_BUCKET_FREE;
698 } else {
699 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
700 bucket, mb_blist);
701 *(cnt_lst->mb_cont.mc_objcount) +=
702 bucket->mb_numfree;
703 }
704 MB_UNLOCK_CONT(gen_list);
705 MB_MBTYPES_INC(cnt_lst, type, 1);
706
707 /* If asked to persist, do not drop the lock. */
708 if ((persist & MBP_PERSIST) == 0)
709 MB_UNLOCK_CONT(cnt_lst);
710 else
711 *pers_list = cnt_lst->mb_cont.mc_numowner;
712 } else {
713 /*
714 * We'll have to allocate a new page.
715 */
716 MB_UNLOCK_CONT(gen_list);
717 bucket = mb_pop_cont(mb_list, how, cnt_lst);
718 if (bucket != NULL) {
719 MB_GET_OBJECT(m, bucket, cnt_lst);
720 MB_MBTYPES_INC(cnt_lst, type, 1);
721
722 /* If asked to persist, do not drop the lock. */
723 if ((persist & MBP_PERSIST) == 0)
724 MB_UNLOCK_CONT(cnt_lst);
725 else
726 *pers_list=cnt_lst->mb_cont.mc_numowner;
727 } else {
728 if (how == M_TRYWAIT) {
729 /*
730 * Absolute worst-case scenario.
731 * We block if we're willing to, but
732 * only after trying to steal from
733 * other lists.
734 */
735 m = mb_alloc_wait(mb_list, type);
736 } else {
737 /* XXX: No consistency. */
738 mbstat.m_drops++;
739
740 if (ticks < last_report ||
741 (ticks - last_report) >= hz) {
742 last_report = ticks;
743 printf(
744 "All mbufs or mbuf clusters exhausted, please see tuning(7).\n");
745 }
746
747 }
748 if (m != NULL && (persist & MBP_PERSIST) != 0) {
749 cnt_lst = MB_GET_PCPU_LIST(mb_list);
750 MB_LOCK_CONT(cnt_lst);
751 *pers_list=cnt_lst->mb_cont.mc_numowner;
752 }
753 }
754 }
755 }
756
757 return (m);
758 }
759
760 /*
761 * This is the worst-case scenario called only if we're allocating with
762 * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
763 * by looking in every PCPU container. If we're still unsuccessful, we
764 * try the general container one last time and possibly block on our
765 * starved cv.
766 */
767 static void *
768 mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
769 {
770 struct mb_pcpu_list *cnt_lst;
771 struct mb_gen_list *gen_list;
772 struct mb_bucket *bucket;
773 void *m;
774 int i, cv_ret;
775
776 /*
777 * Try to reclaim mbuf-related objects (mbufs, clusters).
778 */
779 mb_reclaim();
780
781 /*
782 * Cycle all the PCPU containers. Increment starved counts if found
783 * empty.
784 */
785 for (i = 0; i < NCPU; i++) {
786 if (CPU_ABSENT(i))
787 continue;
788 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
789 MB_LOCK_CONT(cnt_lst);
790
791 /*
792 * If container is non-empty, get a single object from it.
793 * If empty, increment starved count.
794 */
795 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
796 NULL) {
797 MB_GET_OBJECT(m, bucket, cnt_lst);
798 MB_MBTYPES_INC(cnt_lst, type, 1);
799 MB_UNLOCK_CONT(cnt_lst);
800 mbstat.m_wait++; /* XXX: No consistency. */
801 return (m);
802 } else
803 cnt_lst->mb_cont.mc_starved++;
804
805 MB_UNLOCK_CONT(cnt_lst);
806 }
807
808 /*
809 * We're still here, so that means it's time to get the general
810 * container lock, check it one more time (now that mb_reclaim()
811 * has been called) and if we still get nothing, block on the cv.
812 */
813 gen_list = MB_GET_GEN_LIST(mb_list);
814 MB_LOCK_CONT(gen_list);
815 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
816 MB_GET_OBJECT(m, bucket, gen_list);
817 MB_MBTYPES_INC(gen_list, type, 1);
818 MB_UNLOCK_CONT(gen_list);
819 mbstat.m_wait++; /* XXX: No consistency. */
820 return (m);
821 }
822
823 gen_list->mb_cont.mc_starved++;
824 cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
825 gen_list->mb_cont.mc_lock, mbuf_wait);
826 gen_list->mb_cont.mc_starved--;
827
828 if ((cv_ret == 0) &&
829 ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
830 MB_GET_OBJECT(m, bucket, gen_list);
831 MB_MBTYPES_INC(gen_list, type, 1);
832 mbstat.m_wait++; /* XXX: No consistency. */
833 } else {
834 mbstat.m_drops++; /* XXX: No consistency. */
835 m = NULL;
836 }
837
838 MB_UNLOCK_CONT(gen_list);
839
840 return (m);
841 }
842
843 /*-
844 * Free an object to its rightful container.
845 * In the general case, this operation is very easy.
846 * Complications arise primarily if:
847 * (a) We've hit the high limit on number of free objects allowed in
848 * our PCPU container.
849 * (b) We're in a critical situation where our container has been
850 * marked 'starved' and we need to issue wakeups on the starved
851 * condition variable.
852 * (c) Minor (odd) cases: our bucket has migrated while we were
853 * waiting for the lock; our bucket is in the general container;
854 * our bucket is empty.
855 */
856 static __inline
857 void
858 mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist,
859 int *pers_list)
860 {
861 struct mb_pcpu_list *cnt_lst;
862 struct mb_gen_list *gen_list;
863 struct mb_bucket *bucket;
864 u_int owner;
865
866 bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
867
868 /*
869 * Make sure that if after we lock the bucket's present container the
870 * bucket has migrated, that we drop the lock and get the new one.
871 */
872 retry_lock:
873 owner = bucket->mb_owner & ~MB_BUCKET_FREE;
874 switch (owner) {
875 case MB_GENLIST_OWNER:
876 gen_list = MB_GET_GEN_LIST(mb_list);
877 if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
878 if (*pers_list != MB_GENLIST_OWNER) {
879 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
880 *pers_list);
881 MB_UNLOCK_CONT(cnt_lst);
882 MB_LOCK_CONT(gen_list);
883 }
884 } else {
885 MB_LOCK_CONT(gen_list);
886 }
887 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
888 MB_UNLOCK_CONT(gen_list);
889 *pers_list = -1;
890 goto retry_lock;
891 }
892
893 /*
894 * If we're intended for the general container, this is
895 * real easy: no migrating required. The only `bogon'
896 * is that we're now contending with all the threads
897 * dealing with the general list, but this is expected.
898 */
899 MB_PUT_OBJECT(m, bucket, gen_list);
900 MB_MBTYPES_DEC(gen_list, type, 1);
901 if (bucket->mb_owner & MB_BUCKET_FREE) {
902 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
903 bucket, mb_blist);
904 bucket->mb_owner = MB_GENLIST_OWNER;
905 }
906 if (gen_list->mb_cont.mc_starved > 0)
907 cv_signal(&(gen_list->mgl_mstarved));
908 if ((persist & MBP_PERSIST) == 0)
909 MB_UNLOCK_CONT(gen_list);
910 else
911 *pers_list = MB_GENLIST_OWNER;
912 break;
913
914 default:
915 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
916 if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
917 if (*pers_list == MB_GENLIST_OWNER) {
918 gen_list = MB_GET_GEN_LIST(mb_list);
919 MB_UNLOCK_CONT(gen_list);
920 MB_LOCK_CONT(cnt_lst);
921 } else {
922 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
923 *pers_list);
924 owner = *pers_list;
925 }
926 } else {
927 MB_LOCK_CONT(cnt_lst);
928 }
929 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
930 MB_UNLOCK_CONT(cnt_lst);
931 *pers_list = -1;
932 goto retry_lock;
933 }
934
935 MB_PUT_OBJECT(m, bucket, cnt_lst);
936 MB_MBTYPES_DEC(cnt_lst, type, 1);
937 if ((*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) ||
938 (cnt_lst->mb_cont.mc_starved > 0)) {
939 /*
940 * We've hit the high limit of allowed numbers of mbufs
941 * on this PCPU list or we've been flagged that we need
942 * to transfer a bucket over to the general cache.
943 * We must now migrate a bucket over to the general
944 * container.
945 */
946 gen_list = MB_GET_GEN_LIST(mb_list);
947 MB_LOCK_CONT(gen_list);
948 if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
949 bucket =
950 SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
951 SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
952 mb_blist);
953 }
954 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
955 bucket, mb_blist);
956 bucket->mb_owner = MB_GENLIST_OWNER;
957 *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
958 *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
959 (*(cnt_lst->mb_cont.mc_numbucks))--;
960 (*(gen_list->mb_cont.mc_numbucks))++;
961
962 /*
963 * While we're at it, transfer some of the mbtypes
964 * "count load" onto the general list's mbtypes
965 * array, seeing as how we're moving the bucket
966 * there now, meaning that the freeing of objects
967 * there will now decrement the _general list's_
968 * mbtypes counters, and no longer our PCPU list's
969 * mbtypes counters. We do this for the type presently
970 * being freed in an effort to keep the mbtypes
971 * counters approximately balanced across all lists.
972 */
973 MB_MBTYPES_DEC(cnt_lst, type,
974 mb_list->ml_objbucks - bucket->mb_numfree);
975 MB_MBTYPES_INC(gen_list, type,
976 mb_list->ml_objbucks - bucket->mb_numfree);
977
978 if (cnt_lst->mb_cont.mc_starved > 0) {
979 /*
980 * Determine whether or not to keep
981 * transferring buckets to the general list
982 * or whether we've transferred enough already.
983 * The thread that is blocked may end up waking
984 * up in the meantime, but transferring an
985 * extra bucket in a constrained situation
986 * is not so bad, as we're likely to need
987 * it soon anyway.
988 */
989 if (gen_list->mb_cont.mc_starved > 0) {
990 cnt_lst->mb_cont.mc_starved--;
991 cv_signal(&(gen_list->mgl_mstarved));
992 } else
993 cnt_lst->mb_cont.mc_starved = 0;
994 }
995 MB_UNLOCK_CONT(gen_list);
996 if ((persist & MBP_PERSIST) == 0)
997 MB_UNLOCK_CONT(cnt_lst);
998 else
999 *pers_list = owner;
1000 break;
1001 }
1002
1003 if (bucket->mb_owner & MB_BUCKET_FREE) {
1004 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
1005 bucket, mb_blist);
1006 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
1007 }
1008
1009 if ((persist & MBP_PERSIST) == 0)
1010 MB_UNLOCK_CONT(cnt_lst);
1011 else
1012 *pers_list = owner;
1013 break;
1014 }
1015 }
1016
1017 /*
1018 * Drain protocols in hopes to free up some resources.
1019 *
1020 * LOCKING NOTES:
1021 * No locks should be held when this is called. The drain routines have to
1022 * presently acquire some locks, which raises the possibility of a lock
1023 * order violation if we're holding any mutex that is acquired in reverse
1024 * order relative to one of the locks in the drain routines.
1025 */
1026 static void
1027 mb_reclaim(void)
1028 {
1029 struct domain *dp;
1030 struct protosw *pr;
1031
1032 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
1033 "mb_reclaim()");
1034
1035 mbstat.m_drain++; /* XXX: No consistency. */
1036
1037 for (dp = domains; dp != NULL; dp = dp->dom_next)
1038 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
1039 if (pr->pr_drain != NULL)
1040 (*pr->pr_drain)();
1041 }
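
/*
 * A protocol opts in by setting pr_drain in its protosw entry; TCP, for
 * instance, hooks tcp_drain() here, which frees mbufs sitting on
 * per-connection reassembly queues.
 */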
1042
1043 /******************************************************************************
1044 * Internal setup macros.
1045 */
1046
1047 #define _mb_setup(m, type) do { \
1048 (m)->m_type = (type); \
1049 (m)->m_next = NULL; \
1050 (m)->m_nextpkt = NULL; \
1051 (m)->m_data = (m)->m_dat; \
1052 (m)->m_flags = 0; \
1053 } while (0)
1054
1055 #define _mbhdr_setup(m, type) do { \
1056 (m)->m_type = (type); \
1057 (m)->m_next = NULL; \
1058 (m)->m_nextpkt = NULL; \
1059 (m)->m_data = (m)->m_pktdat; \
1060 (m)->m_flags = M_PKTHDR; \
1061 (m)->m_pkthdr.rcvif = NULL; \
1062 (m)->m_pkthdr.csum_flags = 0; \
1063 SLIST_INIT(&(m)->m_pkthdr.tags); \
1064 } while (0)
1065
1066 #define _mcl_setup(m) do { \
1067 (m)->m_data = (m)->m_ext.ext_buf; \
1068 (m)->m_flags |= M_EXT; \
1069 (m)->m_ext.ext_free = NULL; \
1070 (m)->m_ext.ext_args = NULL; \
1071 (m)->m_ext.ext_size = MCLBYTES; \
1072 (m)->m_ext.ext_type = EXT_CLUSTER; \
1073 } while (0)
1074
1075 #define _mext_init_ref(m, ref) do { \
1076 (m)->m_ext.ref_cnt = ((ref) == NULL) ? \
1077 malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \
1078 if ((m)->m_ext.ref_cnt != NULL) { \
1079 *((m)->m_ext.ref_cnt) = 0; \
1080 MEXT_ADD_REF((m)); \
1081 } \
1082 } while (0)
1083
1084 #define cl2ref(cl) \
1085 (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT)
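
/*
 * cl2ref() example: assuming MCLBYTES = 2048 (MCLSHIFT = 11), the
 * cluster at byte offset 4096 from ml_mapbase is the third cluster in
 * the map and indexes refcount slot 4096 >> 11 = 2.
 */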
1086
1087 #define _mext_dealloc_ref(m) \
1088 if ((m)->m_ext.ext_type != EXT_EXTREF) \
1089 free((m)->m_ext.ref_cnt, M_MBUF)
1090
1091 /******************************************************************************
1092 * Internal routines.
1093 *
1094 * Because mb_alloc() and mb_free() are inlines (to keep the common
1095 * cases down to a maximum of one function call), below are a few
1096 * routines used only internally for the sole purpose of making certain
1097 * functions smaller.
1098 *
1099 * - _mext_free(): frees associated storage when the ref. count is
1100 * exactly one and we're freeing.
1101 *
1102 * - _mgetm_internal(): common "persistent-lock" routine that allocates
1103 * an mbuf and a cluster in one shot, but where the lock is already
1104 * held coming in (which is what makes it different from the exported
1105 * m_getcl()). The lock is dropped when done. This is used by m_getm()
1106 * and, therefore, is very m_getm()-specific.
1107 */
1108 static struct mbuf *_mgetm_internal(int, short, short, int);
1109
1110 void
1111 _mext_free(struct mbuf *mb)
1112 {
1113
1114 if (mb->m_ext.ext_type == EXT_CLUSTER) {
1115 mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
1116 0, NULL);
1117 } else {
1118 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
1119 _mext_dealloc_ref(mb);
1120 }
1121 }
1122
1123 static struct mbuf *
1124 _mgetm_internal(int how, short type, short persist, int cchnum)
1125 {
1126 struct mbuf *mb;
1127
1128 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist, &cchnum);
1129 if (mb == NULL)
1130 return (NULL);
1131 _mb_setup(mb, type);
1132
1133 if ((persist & MBP_PERSIST) != 0) {
1134 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
1135 how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
1136 if (mb->m_ext.ext_buf == NULL) {
1137 (void)m_free(mb);
1138 return (NULL);
1139 }
1140 _mcl_setup(mb);
1141 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1142 }
1143 return (mb);
1144 }
1145
1146 /******************************************************************************
1147 * Exported buffer allocation and de-allocation routines.
1148 */
1149
1150 /*
1151 * Allocate and return a single (normal) mbuf. NULL is returned on failure.
1152 *
1153 * Arguments:
1154 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1155 * if really starved for memory. M_DONTWAIT to never block.
1156 * - type: the type of the mbuf being allocated.
1157 */
1158 struct mbuf *
1159 m_get(int how, short type)
1160 {
1161 struct mbuf *mb;
1162
1163 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1164 if (mb != NULL)
1165 _mb_setup(mb, type);
1166 return (mb);
1167 }
1168
1169 /*
1170 * Allocate a given length worth of mbufs and/or clusters (whatever fits
1171 * best) and return a pointer to the top of the allocated chain. If an
1172 * existing mbuf chain is provided, then we will append the new chain
1173 * to the existing one but still return the top of the newly allocated
1174 * chain. NULL is returned on failure, in which case the [optional]
1175 * provided chain is left untouched, and any memory already allocated
1176 * is freed.
1177 *
1178 * Arguments:
1179 * - m: existing chain to which to append new chain (optional).
1180 * - len: total length of data to append, either in mbufs or clusters
1181 * (we allocate whatever combination yields the best fit).
1182 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1183 * if really starved for memory. M_DONTWAIT to never block.
1184 * - type: the type of the mbuf being allocated.
1185 */
1186 struct mbuf *
1187 m_getm(struct mbuf *m, int len, int how, short type)
1188 {
1189 struct mbuf *mb, *top, *cur, *mtail;
1190 int num, rem, cchnum;
1191 short persist;
1192 int i;
1193
1194 KASSERT(len >= 0, ("m_getm(): len is < 0"));
1195
1196 /* If m != NULL, we will append to the end of that chain. */
1197 if (m != NULL)
1198 for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
1199 else
1200 mtail = NULL;
1201
1202 /*
1203 * In the best-case scenario (which should be the common case
1204 * unless we're in a starvation situation), we will be able to
1205 * go through the allocation of all the desired mbufs and clusters
1206 * here without dropping our per-CPU cache lock in between.
1207 */
1208 num = len / MCLBYTES;
1209 rem = len % MCLBYTES;
1210 persist = 0;
1211 cchnum = -1;
1212 top = cur = NULL;
1213 for (i = 0; i < num; i++) {
1214 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
1215 MBP_PERSIST | persist, &cchnum);
1216 if (mb == NULL)
1217 goto failed;
1218 _mb_setup(mb, type);
1219 mb->m_len = 0;
1220
1221 persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0;
1222 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
1223 how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum);
1224 if (mb->m_ext.ext_buf == NULL) {
1225 (void)m_free(mb);
1226 goto failed;
1227 }
1228 _mcl_setup(mb);
1229 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1230 persist = MBP_PERSISTENT;
1231
1232 if (cur == NULL)
1233 top = cur = mb;
1234 else
1235 cur = (cur->m_next = mb);
1236 }
1237 if (rem > 0) {
1238 if (cchnum >= 0) {
1239 persist = MBP_PERSISTENT;
1240 persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0;
1241 mb = _mgetm_internal(how, type, persist, cchnum);
1242 if (mb == NULL)
1243 goto failed;
1244 } else if (rem > MINCLSIZE) {
1245 mb = m_getcl(how, type, 0);
1246 } else {
1247 mb = m_get(how, type);
1248 }
1249 if (mb != NULL) {
1250 mb->m_len = 0;
1251 if (cur == NULL)
1252 top = mb;
1253 else
1254 cur->m_next = mb;
1255 } else
1256 goto failed;
1257 }
1258
1259 if (mtail != NULL)
1260 mtail->m_next = top;
1261 return (top);
1262 failed:
1263 if (top != NULL)
1264 m_freem(top);
1265 return (NULL);
1266 }
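
/*
 * Usage example (hypothetical caller): request 3000 bytes worth of
 * buffers with no pre-existing chain. Assuming MCLBYTES = 2048, the
 * loop above allocates one cluster-backed mbuf (3000 / 2048 = 1), and
 * the 952-byte remainder exceeds MINCLSIZE, so a second cluster-backed
 * mbuf is chained on:
 *
 *        struct mbuf *m;
 *
 *        m = m_getm(NULL, 3000, M_DONTWAIT, MT_DATA);
 *        if (m == NULL)
 *                return (ENOBUFS);
 */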
1267
1268 /*
1269 * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure.
1270 *
1271 * Arguments:
1272 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1273 * if really starved for memory. M_DONTWAIT to never block.
1274 * - type: the type of the mbuf being allocated.
1275 */
1276 struct mbuf *
1277 m_gethdr(int how, short type)
1278 {
1279 struct mbuf *mb;
1280
1281 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1282 if (mb != NULL) {
1283 _mbhdr_setup(mb, type);
1284 #ifdef MAC
1285 if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
1286 m_free(mb);
1287 return (NULL);
1288 }
1289 #endif
1290 }
1291 return (mb);
1292 }
1293
1294 /*
1295 * Allocate and return a single (normal) pre-zero'd mbuf. NULL is
1296 * returned on failure.
1297 *
1298 * Arguments:
1299 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1300 * if really starved for memory. M_DONTWAIT to never block.
1301 * - type: the type of the mbuf being allocated.
1302 */
1303 struct mbuf *
1304 m_get_clrd(int how, short type)
1305 {
1306 struct mbuf *mb;
1307
1308 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1309 if (mb != NULL) {
1310 _mb_setup(mb, type);
1311 bzero(mtod(mb, caddr_t), MLEN);
1312 }
1313 return (mb);
1314 }
1315
1316 /*
1317 * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is
1318 * returned on failure.
1319 *
1320 * Arguments:
1321 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1322 * if really starved for memory. M_DONTWAIT to never block.
1323 * - type: the type of the mbuf being allocated.
1324 */
1325 struct mbuf *
1326 m_gethdr_clrd(int how, short type)
1327 {
1328 struct mbuf *mb;
1329
1330 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1331 if (mb != NULL) {
1332 _mbhdr_setup(mb, type);
1333 #ifdef MAC
1334 if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
1335 m_free(mb);
1336 return (NULL);
1337 }
1338 #endif
1339 bzero(mtod(mb, caddr_t), MHLEN);
1340 }
1341 return (mb);
1342 }
1343
1344 /*
1345 * Free a single mbuf and any associated storage that it may have attached
1346 * to it. The associated storage may not be immediately freed if its
1347 * reference count is above 1. Returns the next mbuf in the chain following
1348 * the mbuf being freed.
1349 *
1350 * Arguments:
1351 * - mb: the mbuf to free.
1352 */
1353 struct mbuf *
1354 m_free(struct mbuf *mb)
1355 {
1356 struct mbuf *nb;
1357 int cchnum;
1358 short persist = 0;
1359
1360 #ifdef INVARIANTS
1361 if (mb->m_flags & M_FREELIST)
1362 panic("m_free detected a mbuf double-free");
1363 mb->m_flags |= M_FREELIST;
1364 #endif
1365 if ((mb->m_flags & M_PKTHDR) != 0)
1366 m_tag_delete_chain(mb, NULL);
1367 nb = mb->m_next;
1368 if ((mb->m_flags & M_EXT) != 0) {
1369 MEXT_REM_REF(mb);
1370 if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) {
1371 if (mb->m_ext.ext_type == EXT_CLUSTER) {
1372 mb_free(&mb_list_clust,
1373 (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
1374 MBP_PERSIST, &cchnum);
1375 persist = MBP_PERSISTENT;
1376 } else {
1377 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf,
1378 mb->m_ext.ext_args);
1379 _mext_dealloc_ref(mb);
1380 persist = 0;
1381 }
1382 }
1383 }
1384 mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum);
1385 return (nb);
1386 }
1387
1388 /*
1389 * Free an entire chain of mbufs and associated external buffers, if
1390 * applicable. Right now, we only optimize a little so that the cache
1391 * lock may be held across a single mbuf+cluster free. Hopefully,
1392 * we'll eventually be holding the lock across more than merely two
1393 * consecutive frees but right now this is hard to implement because of
1394 * things like _mext_dealloc_ref (may do a free()) and atomic ops in the
1395 * loop.
1396 *
1397 * - mb: the mbuf chain to free.
1398 */
1399 void
1400 m_freem(struct mbuf *mb)
1401 {
1402
1403 while (mb != NULL)
1404 mb = m_free(mb);
1405 }
1406
1407 /*
1408 * Fetch an mbuf with a cluster attached to it. If one of the
1409 * allocations fails, the entire allocation fails. This routine is
1410 * the preferred way of fetching both the mbuf and cluster together,
1411 * as it avoids having to unlock/relock between allocations. Returns
1412 * NULL on failure.
1413 *
1414 * Arguments:
1415 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1416 * if really starved for memory. M_DONTWAIT to never block.
1417 * - type: the type of the mbuf being allocated.
1418 * - flags: any flags to pass to the mbuf being allocated; if this includes
1419 * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf.
1420 */
1421 struct mbuf *
1422 m_getcl(int how, short type, int flags)
1423 {
1424 struct mbuf *mb;
1425 int cchnum;
1426
1427 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
1428 MBP_PERSIST, &cchnum);
1429 if (mb == NULL)
1430 return (NULL);
1431 mb->m_type = type;
1432 mb->m_next = NULL;
1433 mb->m_flags = flags;
1434 if ((flags & M_PKTHDR) != 0) {
1435 mb->m_nextpkt = NULL;
1436 mb->m_pkthdr.rcvif = NULL;
1437 mb->m_pkthdr.csum_flags = 0;
1438 SLIST_INIT(&mb->m_pkthdr.tags);
1439 }
1440
1441 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
1442 MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
1443 if (mb->m_ext.ext_buf == NULL) {
1444 (void)m_free(mb);
1445 mb = NULL;
1446 } else {
1447 _mcl_setup(mb);
1448 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1449 }
1450 #ifdef MAC
1451 if (flags & M_PKTHDR) {
1452 if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
1453 m_free(mb);
1454 return (NULL);
1455 }
1456 }
1457 #endif
1458 return (mb);
1459 }
1460
1461 /*
1462 * Fetch a single mbuf cluster and attach it to an existing mbuf. If
1463 * successful, configures the provided mbuf to have mbuf->m_ext.ext_buf
1464 * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags.
1465 * The M_EXT bit is not set on failure.
1466 *
1467 * Arguments:
1468 * - mb: the existing mbuf to which to attach the allocated cluster.
1469 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1470 * if really starved for memory. M_DONTWAIT to never block.
1471 */
1472 void
1473 m_clget(struct mbuf *mb, int how)
1474 {
1475
1476 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
1477 MT_NOTMBUF, 0, NULL);
1478 if (mb->m_ext.ext_buf != NULL) {
1479 _mcl_setup(mb);
1480 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1481 }
1482 }
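
/*
 * Typical caller pattern (a sketch): m_clget() returns nothing, so
 * callers detect failure by testing M_EXT afterwards:
 *
 *        m_clget(m, M_DONTWAIT);
 *        if ((m->m_flags & M_EXT) == 0) {
 *                m_freem(m);
 *                return (ENOBUFS);
 *        }
 */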
1483
1484 /*
1485 * Configure a provided mbuf to refer to the provided external storage
1486 * buffer and setup a reference count for said buffer. If the setting
1487 * up of the reference count fails, the M_EXT bit will not be set. If
1488 * successfull, the M_EXT bit is set in the mbuf's flags.
1489 *
1490 * Arguments:
1491 * - mb: the existing mbuf to which to attach the provided buffer.
1492 * - buf: the address of the provided external storage buffer.
1493 * - size: the size of the provided buffer.
1494 * - freef: a pointer to a routine that is responsible for freeing the
1495 * provided external storage buffer.
1496 * - args: a pointer to an argument structure (of any type) to be passed
1497 * to the provided freef routine (may be NULL).
1498 * - flags: any other flags to be passed to the provided mbuf.
1499 * - type: the type that the external storage buffer should be labeled with.
1500 */
1501 void
1502 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
1503 void (*freef)(void *, void *), void *args, int flags, int type)
1504 {
1505 u_int *ref_cnt = NULL;
1506
1507 if (type == EXT_CLUSTER)
1508 ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)];
1509 else if (type == EXT_EXTREF)
1510 ref_cnt = mb->m_ext.ref_cnt;
1511 _mext_init_ref(mb, ref_cnt);
1512 if (mb->m_ext.ref_cnt != NULL) {
1513 mb->m_flags |= (M_EXT | flags);
1514 mb->m_ext.ext_buf = buf;
1515 mb->m_data = mb->m_ext.ext_buf;
1516 mb->m_ext.ext_size = size;
1517 mb->m_ext.ext_free = freef;
1518 mb->m_ext.ext_args = args;
1519 mb->m_ext.ext_type = type;
1520 }
1521 }
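
/*
 * Usage sketch (hypothetical driver caller; drv_buf, drv_free, drv_sc
 * and DRV_BUFSIZE are illustrative names, and EXT_NET_DRV is the
 * conventional ext_type for driver-supplied storage):
 *
 *        m_extadd(m, drv_buf, DRV_BUFSIZE, drv_free, drv_sc, 0,
 *            EXT_NET_DRV);
 *        if ((m->m_flags & M_EXT) == 0)
 *                ... reference count setup failed, buffer not attached ...
 */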
1522
1523 /*
1524 * Change type of provided mbuf. This is a relatively expensive operation
1525 * (due to the cost of statistics manipulations) and should be avoided, where
1526 * possible.
1527 *
1528 * Arguments:
1529 * - mb: the provided mbuf for which the type needs to be changed.
1530 * - new_type: the new type to change the mbuf to.
1531 */
1532 void
1533 m_chtype(struct mbuf *mb, short new_type)
1534 {
1535 struct mb_gen_list *gen_list;
1536
1537 gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
1538 MB_LOCK_CONT(gen_list);
1539 MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
1540 MB_MBTYPES_INC(gen_list, new_type, 1);
1541 MB_UNLOCK_CONT(gen_list);
1542 mb->m_type = new_type;
1543 }