FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_mbuf.c
1 /*-
2 * Copyright (c) 2001, 2002
3 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 * $FreeBSD: releng/5.0/sys/kern/subr_mbuf.c 107297 2002-11-27 04:26:00Z tjr $
29 */
30
31 #include "opt_mac.h"
32 #include "opt_param.h"
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/mac.h>
38 #include <sys/mbuf.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/condvar.h>
42 #include <sys/smp.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/domain.h>
46 #include <sys/protosw.h>
47
48 #include <vm/vm.h>
49 #include <vm/vm_kern.h>
50 #include <vm/vm_extern.h>
51 #include <vm/pmap.h>
52 #include <vm/vm_map.h>
53
54 /******************************************************************************
55 * mb_alloc mbuf and cluster allocator.
56 *
57 * Maximum number of PCPU containers. If you know what you're doing you could
58 * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
59 * system during compilation, and thus prevent kernel structure bloat.
60 *
61 * SMP and non-SMP kernels clearly have a different number of possible CPUs,
62 * but because we cannot assume a dense array of CPUs, we always allocate
63 * and traverse up to NCPU PCPU containers and merely check for
64 * CPU availability.
65 */
66 #ifdef MBALLOC_NCPU
67 #define NCPU MBALLOC_NCPU
68 #else
69 #define NCPU MAXCPU
70 #endif
71
72 /*-
73 * The mbuf allocator is heavily based on Alfred Perlstein's
74 * (alfred@FreeBSD.org) "memcache" allocator which is itself based
75 * on concepts from several per-CPU memory allocators. The difference
76 * between this allocator and memcache is that, among other things:
77 *
78 * (i) We don't free back to the map from the free() routine - we leave the
79 * option of implementing lazy freeing (from a kproc) in the future.
80 *
81 * (ii) We allocate from separate sub-maps of kmem_map, thus limiting the
82 * maximum number of allocatable objects of a given type. Further,
83 * we handle blocking on a cv in the case that the map is starved and
84 * we have to rely solely on cached (circulating) objects.
85 *
86 * The mbuf allocator keeps all objects that it allocates in mb_buckets.
87 * The buckets keep a page worth of objects (an object can be an mbuf or an
88 * mbuf cluster) and facilitate moving larger sets of contiguous objects
89 * from the per-CPU lists to the main list for the given object. The buckets
90 * also have an added advantage in that after several moves from a per-CPU
91 * list to the main list and back to the per-CPU list, contiguous objects
92 * are kept together, thus trying to put the TLB cache to good use.
93 *
94 * The buckets are kept on singly-linked lists called "containers." A container
95 * is protected by a mutex lock in order to ensure consistency. The mutex lock
96 * itself is allocated separately and attached to the container at boot time,
97 * thus allowing for certain containers to share the same mutex lock. Per-CPU
98 * containers for mbufs and mbuf clusters all share the same per-CPU
99 * lock whereas the "general system" containers (i.e., the "main lists") for
100 * these objects share one global lock.
101 */
102 struct mb_bucket {
103 SLIST_ENTRY(mb_bucket) mb_blist;
104 int mb_owner;
105 int mb_numfree;
106 void *mb_free[0];
107 };
108
109 struct mb_container {
110 SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
111 struct mtx *mc_lock;
112 int mc_numowner;
113 u_int mc_starved;
114 long *mc_types;
115 u_long *mc_objcount;
116 u_long *mc_numpgs;
117 };
118
119 struct mb_gen_list {
120 struct mb_container mb_cont;
121 struct cv mgl_mstarved;
122 };
123
124 struct mb_pcpu_list {
125 struct mb_container mb_cont;
126 };
127
128 /*
129 * Boot-time configurable object counts that will determine the maximum
130 * number of permitted objects in the mbuf and mcluster cases. In the
131 * ext counter (nmbcnt) case, it's just an indicator serving to scale
132 * kmem_map size properly - in other words, we may be allowed to allocate
133 * more than nmbcnt counters, whereas we will never be allowed to allocate
134 * more than nmbufs mbufs or nmbclusters mclusters.
135 * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
136 * allocatable by the sfbuf allocator (found in uipc_syscalls.c)
137 */
138 #ifndef NMBCLUSTERS
139 #define NMBCLUSTERS (1024 + maxusers * 64)
140 #endif
141 #ifndef NMBUFS
142 #define NMBUFS (nmbclusters * 2)
143 #endif
144 #ifndef NSFBUFS
145 #define NSFBUFS (512 + maxusers * 16)
146 #endif
147 #ifndef NMBCNTS
148 #define NMBCNTS (nmbclusters + nsfbufs)
149 #endif
150 int nmbufs;
151 int nmbclusters;
152 int nmbcnt;
153 int nsfbufs;
154
155 /*
156 * Perform sanity checks of tunables declared above.
157 */
158 static void
159 tunable_mbinit(void *dummy)
160 {
161
162 /*
163 * This has to be done before VM init.
164 */
165 nmbclusters = NMBCLUSTERS;
166 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
167 nmbufs = NMBUFS;
168 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
169 nsfbufs = NSFBUFS;
170 TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
171 nmbcnt = NMBCNTS;
172 TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
173 /* Sanity checks */
174 if (nmbufs < nmbclusters * 2)
175 nmbufs = nmbclusters * 2;
176 if (nmbcnt < nmbclusters + nsfbufs)
177 nmbcnt = nmbclusters + nsfbufs;
178 }
179 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
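/*
 * Illustrative note (not part of the original file): the four values fetched
 * above are boot-time tunables, so they are normally set from
 * /boot/loader.conf rather than changed on a running system, e.g. (the
 * numbers below are purely hypothetical):
 *
 *	kern.ipc.nmbclusters="32768"
 *	kern.ipc.nmbufs="65536"
 *
 * The sanity checks above will still raise nmbufs and nmbcnt if the chosen
 * values break the nmbufs >= 2 * nmbclusters relationship; see tuning(7).
 */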
180
181 /*
182 * The freelist structures and mutex locks. The number statically declared
183 * here depends on the number of CPUs.
184 *
185 * We set up in such a way that all the objects (mbufs, clusters)
186 * share the same mutex lock. It has been established that we do not benefit
187 * from different locks for different objects, so we use the same lock,
188 * regardless of object type. This also allows us to do optimised
189 * multi-object allocations without dropping the lock in between.
190 */
191 struct mb_lstmngr {
192 struct mb_gen_list *ml_genlist;
193 struct mb_pcpu_list *ml_cntlst[NCPU];
194 struct mb_bucket **ml_btable;
195 vm_map_t ml_map;
196 vm_offset_t ml_mapbase;
197 vm_offset_t ml_maptop;
198 int ml_mapfull;
199 u_int ml_objsize;
200 u_int *ml_wmhigh;
201 };
202 static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
203 static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
204 u_int *cl_refcntmap;
205
206 /*
207 * Local macros for internal allocator structure manipulations.
208 */
209 #ifdef SMP
210 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
211 #else
212 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
213 #endif
214
215 #define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
216
217 #define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
218
219 #define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
220
221 #define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
222 (mb_lst)->ml_cntlst[(num)]
223
224 #define MB_BUCKET_INDX(mb_obj, mb_lst) \
225 (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
226
227 #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
228 { \
229 struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
230 \
231 (mb_bckt)->mb_numfree--; \
232 (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
233 (*((mb_lst)->mb_cont.mc_objcount))--; \
234 if ((mb_bckt)->mb_numfree == 0) { \
235 SLIST_REMOVE_HEAD(_mchd, mb_blist); \
236 SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
237 (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
238 } \
239 }
240
241 #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
242 (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
243 (mb_bckt)->mb_numfree++; \
244 (*((mb_lst)->mb_cont.mc_objcount))++;
245
246 #define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
247 if ((mb_type) != MT_NOTMBUF) \
248 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
249
250 #define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
251 if ((mb_type) != MT_NOTMBUF) \
252 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
253
254 /*
255 * Ownership of buckets/containers is represented by integers. The PCPU
256 * lists range from 0 to NCPU-1. We need a free numerical id for the general
257 * list (we use NCPU). We also need a non-conflicting free bit to indicate
258 * that the bucket is free and removed from a container, while not losing
259 * the bucket's originating container id. We use the highest bit
260 * for the free marker.
261 */
262 #define MB_GENLIST_OWNER (NCPU)
263 #define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
264
265 /* Statistics structures for allocator (per-CPU and general). */
266 static struct mbpstat mb_statpcpu[NCPU + 1];
267 struct mbstat mbstat;
268
269 /* Sleep time for wait code (in ticks). */
270 static int mbuf_wait = 64;
271
272 static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */
273 static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */
274
275 /*
276 * Objects exported by sysctl(8).
277 */
278 SYSCTL_DECL(_kern_ipc);
279 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RD, &nmbclusters, 0,
280 "Maximum number of mbuf clusters available");
281 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RD, &nmbufs, 0,
282 "Maximum number of mbufs available");
283 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RD, &nmbcnt, 0,
284 "Number used to scale kmem_map to ensure sufficient space for counters");
285 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RD, &nsfbufs, 0,
286 "Maximum number of sendfile(2) sf_bufs available");
287 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
288 "Sleep time of mbuf subsystem wait allocations during exhaustion");
289 SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_limit, CTLFLAG_RW, &mbuf_limit, 0,
290 "Upper limit of number of mbufs allowed on each PCPU list");
291 SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_limit, CTLFLAG_RW, &clust_limit, 0,
292 "Upper limit of number of mbuf clusters allowed on each PCPU list");
293 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
294 "Mbuf general information and statistics");
295 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
296 sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
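/*
 * Illustrative note (not part of the original file): unlike the read-only
 * maxima above, mbuf_limit and clust_limit are CTLFLAG_RW and may be
 * adjusted on a running system, e.g. (the value is purely hypothetical):
 *
 *	# sysctl kern.ipc.mbuf_limit=1024
 *
 * Raising a watermark lets each PCPU list cache more free objects before
 * mb_free() starts migrating whole buckets back to the general list.
 */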
297
298 /*
299 * Prototypes of local allocator routines.
300 */
301 static void *mb_alloc_wait(struct mb_lstmngr *, short);
302 static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
303 struct mb_pcpu_list *);
304 static void mb_reclaim(void);
305 static void mbuf_init(void *);
306
307 /*
308 * Initial allocation numbers. Each parameter represents the number of buckets
309 * of each object that will be placed initially in each PCPU container for
310 * said object.
311 */
312 #define NMB_MBUF_INIT 4
313 #define NMB_CLUST_INIT 16
314
315 /*
316 * Internal flags that allow for cache locks to remain "persistent" across
317 * allocation and free calls. They may be used in combination.
318 */
319 #define MBP_PERSIST 0x1 /* Return with lock still held. */
320 #define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */
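/*
 * Illustrative sketch (not part of the original file, compiled out): how a
 * caller combines these flags to take an mbuf and then a cluster from the
 * same per-CPU cache without dropping the cache lock in between.  This
 * mirrors what m_getcl() does further below; the function name
 * example_get_pair() is hypothetical.
 */
#if 0
static struct mbuf *
example_get_pair(int how, short type)
{
	struct mbuf *mb;
	int cchnum;		/* Which PCPU cache the first call locked. */

	/* First allocation: ask to keep the cache lock (MBP_PERSIST). */
	mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
	    MBP_PERSIST, &cchnum);
	if (mb == NULL)
		return (NULL);

	/* Second allocation: the lock is already held (MBP_PERSISTENT). */
	mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
	    MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
	if (mb->m_ext.ext_buf == NULL) {
		(void)m_free(mb);
		return (NULL);
	}
	/* _mcl_setup() and _mext_init_ref() would follow, as in m_getcl(). */
	return (mb);
}
#endif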
321
322 /*
323 * Initialize the mbuf subsystem.
324 *
325 * We sub-divide the kmem_map into several submaps; this way, we don't have
326 * to worry about artificially limiting the number of mbuf or mbuf cluster
327 * allocations, due to fear of one type of allocation "stealing" address
328 * space initially reserved for another.
329 *
330 * Set up both the general containers and all the PCPU containers. Populate
331 * the PCPU containers with initial numbers.
332 */
333 MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
334 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
335 static void
336 mbuf_init(void *dummy)
337 {
338 struct mb_pcpu_list *pcpu_cnt;
339 vm_size_t mb_map_size;
340 int i, j;
341
342 /*
343 * Set up all the submaps, for each type of object that we deal
344 * with in this allocator. We also allocate space for the cluster
345 * ref. counts in the mbuf map (and not the cluster map) in order to
346 * give clusters a nice contiguous address space without any holes.
347 */
348 mb_map_size = (vm_size_t)(nmbufs * MSIZE + nmbclusters * sizeof(u_int));
349 mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
350 mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE *
351 sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
352 if (mb_list_mbuf.ml_btable == NULL)
353 goto bad;
354 mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
355 &(mb_list_mbuf.ml_maptop), mb_map_size);
356 mb_list_mbuf.ml_map->system_map = 1;
357 mb_list_mbuf.ml_mapfull = 0;
358 mb_list_mbuf.ml_objsize = MSIZE;
359 mb_list_mbuf.ml_wmhigh = &mbuf_limit;
360
361 mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
362 mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
363 mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size / PAGE_SIZE
364 * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
365 if (mb_list_clust.ml_btable == NULL)
366 goto bad;
367 mb_list_clust.ml_map = kmem_suballoc(kmem_map,
368 &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
369 mb_map_size);
370 mb_list_clust.ml_map->system_map = 1;
371 mb_list_clust.ml_mapfull = 0;
372 mb_list_clust.ml_objsize = MCLBYTES;
373 mb_list_clust.ml_wmhigh = &clust_limit;
374
375 /*
376 * Allocate required general (global) containers for each object type.
377 */
378 mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
379 M_NOWAIT);
380 mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
381 M_NOWAIT);
382 if ((mb_list_mbuf.ml_genlist == NULL) ||
383 (mb_list_clust.ml_genlist == NULL))
384 goto bad;
385
386 /*
387 * Initialize condition variables and general container mutex locks.
388 */
389 mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0);
390 cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
391 cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
392 "mcluster pool starved");
393 mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
394 mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
395
396 /*
397 * Set up the general containers for each object.
398 */
399 mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
400 mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
401 mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
402 mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
403 mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
404 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
405 mb_list_clust.ml_genlist->mb_cont.mc_objcount =
406 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
407 mb_list_mbuf.ml_genlist->mb_cont.mc_numpgs =
408 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbpgs);
409 mb_list_clust.ml_genlist->mb_cont.mc_numpgs =
410 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clpgs);
411 mb_list_mbuf.ml_genlist->mb_cont.mc_types =
412 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
413 mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
414 SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
415 SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
416
417 /*
418 * Allocate all the required counters for clusters. This makes
419 * cluster allocations/deallocations much faster.
420 */
421 cl_refcntmap = (u_int *)kmem_malloc(mb_list_mbuf.ml_map,
422 roundup(nmbclusters * sizeof(u_int), MSIZE), M_NOWAIT);
423 if (cl_refcntmap == NULL)
424 goto bad;
425
426 /*
427 * Initialize general mbuf statistics.
428 */
429 mbstat.m_msize = MSIZE;
430 mbstat.m_mclbytes = MCLBYTES;
431 mbstat.m_minclsize = MINCLSIZE;
432 mbstat.m_mlen = MLEN;
433 mbstat.m_mhlen = MHLEN;
434 mbstat.m_numtypes = MT_NTYPES;
435
436 /*
437 * Allocate and initialize PCPU containers.
438 */
439 for (i = 0; i < NCPU; i++) {
440 if (CPU_ABSENT(i))
441 continue;
442
443 mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
444 M_MBUF, M_NOWAIT);
445 mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
446 M_MBUF, M_NOWAIT);
447 if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
448 (mb_list_clust.ml_cntlst[i] == NULL))
449 goto bad;
450
451 mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0);
452 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
453 mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
454
455 mb_statpcpu[i].mb_active = 1;
456 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
457 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
458 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
459 mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
460 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
461 &(mb_statpcpu[i].mb_mbfree);
462 mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
463 &(mb_statpcpu[i].mb_clfree);
464 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numpgs =
465 &(mb_statpcpu[i].mb_mbpgs);
466 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numpgs =
467 &(mb_statpcpu[i].mb_clpgs);
468 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
469 &(mb_statpcpu[i].mb_mbtypes[0]);
470 mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
471
472 SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
473 SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
474
475 /*
476 * Perform initial allocations.
477 */
478 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
479 MB_LOCK_CONT(pcpu_cnt);
480 for (j = 0; j < NMB_MBUF_INIT; j++) {
481 if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
482 == NULL)
483 goto bad;
484 }
485 MB_UNLOCK_CONT(pcpu_cnt);
486
487 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
488 MB_LOCK_CONT(pcpu_cnt);
489 for (j = 0; j < NMB_CLUST_INIT; j++) {
490 if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
491 == NULL)
492 goto bad;
493 }
494 MB_UNLOCK_CONT(pcpu_cnt);
495 }
496
497 return;
498 bad:
499 panic("mbuf_init(): failed to initialize mbuf subsystem!");
500 }
501
502 /*
503 * Populate a given mbuf PCPU container with a bucket full of fresh new
504 * buffers. Return a pointer to the new bucket (already in the container if
505 * successful), or return NULL on failure.
506 *
507 * LOCKING NOTES:
508 * PCPU container lock must be held when this is called.
509 * The lock is dropped here so that we can cleanly call the underlying VM
510 * code. If we fail, we return with no locks held. If we succeed (i.e., return
511 * non-NULL), we return with the PCPU lock held, ready for allocation from
512 * the returned bucket.
513 */
514 static struct mb_bucket *
515 mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
516 {
517 struct mb_bucket *bucket;
518 caddr_t p;
519 int i;
520
521 MB_UNLOCK_CONT(cnt_lst);
522 /*
523 * If our object's (finite) map is starved (i.e., no more address
524 * space), bail out now.
525 */
526 if (mb_list->ml_mapfull)
527 return (NULL);
528
529 bucket = malloc(sizeof(struct mb_bucket) +
530 PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
531 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
532 if (bucket == NULL)
533 return (NULL);
534
535 p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
536 how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
537 if (p == NULL) {
538 free(bucket, M_MBUF);
539 if (how == M_TRYWAIT)
540 mb_list->ml_mapfull = 1;
541 return (NULL);
542 }
543
544 bucket->mb_numfree = 0;
545 mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
546 for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
547 bucket->mb_free[i] = p;
548 bucket->mb_numfree++;
549 p += mb_list->ml_objsize;
550 }
551
552 MB_LOCK_CONT(cnt_lst);
553 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
554 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
555 (*(cnt_lst->mb_cont.mc_numpgs))++;
556 *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
557
558 return (bucket);
559 }
560
561 /*
562 * Allocate an mbuf-subsystem type object.
563 * The general case is very easy. Complications only arise if our PCPU
564 * container is empty. Things get worse if the PCPU container is empty,
565 * the general container is empty, and we've run out of address space
566 * in our map; then we try to block if we're willing to (M_TRYWAIT).
567 */
568 static __inline
569 void *
570 mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
571 int *pers_list)
572 {
573 static int last_report;
574 struct mb_pcpu_list *cnt_lst;
575 struct mb_bucket *bucket;
576 void *m;
577
578 m = NULL;
579 if ((persist & MBP_PERSISTENT) != 0) {
580 /*
581 * If we're a "persistent" call, then the per-CPU #(pers_list)
582 * cache lock is already held, and we just need to refer to
583 * the correct cache descriptor.
584 */
585 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list);
586 } else {
587 cnt_lst = MB_GET_PCPU_LIST(mb_list);
588 MB_LOCK_CONT(cnt_lst);
589 }
590
591 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
592 /*
593 * This is the easy allocation case. We just grab an object
594 * from a bucket in the PCPU container. At worst, we
595 * have just emptied the bucket and so we remove it
596 * from the container.
597 */
598 MB_GET_OBJECT(m, bucket, cnt_lst);
599 MB_MBTYPES_INC(cnt_lst, type, 1);
600
601 /* If asked to persist, do not drop the lock. */
602 if ((persist & MBP_PERSIST) == 0)
603 MB_UNLOCK_CONT(cnt_lst);
604 else
605 *pers_list = cnt_lst->mb_cont.mc_numowner;
606 } else {
607 struct mb_gen_list *gen_list;
608
609 /*
610 * This is the less-common, more difficult case. We must
611 * first verify if the general list has anything for us
612 * and if that also fails, we must allocate a page from
613 * the map and create a new bucket to place in our PCPU
614 * container (already locked). If the map is starved then
615 * we're really in for trouble, as we have to wait on
616 * the general container's condition variable.
617 */
618 gen_list = MB_GET_GEN_LIST(mb_list);
619 MB_LOCK_CONT(gen_list);
620
621 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
622 != NULL) {
623 /*
624 * Give ownership of the bucket to our CPU's
625 * container, but only actually put the bucket
626 * in the container if it doesn't become free
627 * upon removing an mbuf from it.
628 */
629 SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
630 mb_blist);
631 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
632 (*(gen_list->mb_cont.mc_numpgs))--;
633 (*(cnt_lst->mb_cont.mc_numpgs))++;
634 *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
635 bucket->mb_numfree--;
636 m = bucket->mb_free[(bucket->mb_numfree)];
637 if (bucket->mb_numfree == 0) {
638 SLIST_NEXT(bucket, mb_blist) = NULL;
639 bucket->mb_owner |= MB_BUCKET_FREE;
640 } else {
641 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
642 bucket, mb_blist);
643 *(cnt_lst->mb_cont.mc_objcount) +=
644 bucket->mb_numfree;
645 }
646 MB_UNLOCK_CONT(gen_list);
647 MB_MBTYPES_INC(cnt_lst, type, 1);
648
649 /* If asked to persist, do not drop the lock. */
650 if ((persist & MBP_PERSIST) == 0)
651 MB_UNLOCK_CONT(cnt_lst);
652 else
653 *pers_list = cnt_lst->mb_cont.mc_numowner;
654 } else {
655 /*
656 * We'll have to allocate a new page.
657 */
658 MB_UNLOCK_CONT(gen_list);
659 bucket = mb_pop_cont(mb_list, how, cnt_lst);
660 if (bucket != NULL) {
661 MB_GET_OBJECT(m, bucket, cnt_lst);
662 MB_MBTYPES_INC(cnt_lst, type, 1);
663
664 /* If asked to persist, do not drop the lock. */
665 if ((persist & MBP_PERSIST) == 0)
666 MB_UNLOCK_CONT(cnt_lst);
667 else
668 *pers_list=cnt_lst->mb_cont.mc_numowner;
669 } else {
670 if (how == M_TRYWAIT) {
671 /*
672 * Absolute worst-case scenario.
673 * We block if we're willing to, but
674 * only after trying to steal from
675 * other lists.
676 */
677 m = mb_alloc_wait(mb_list, type);
678 } else {
679 /* XXX: No consistency. */
680 mbstat.m_drops++;
681
682 if (ticks < last_report ||
683 (ticks - last_report) >= hz) {
684 last_report = ticks;
685 printf(
686 "All mbufs or mbuf clusters exhausted, please see tuning(7).\n");
687 }
688
689 }
690 if (m != NULL && (persist & MBP_PERSIST) != 0) {
691 cnt_lst = MB_GET_PCPU_LIST(mb_list);
692 MB_LOCK_CONT(cnt_lst);
693 *pers_list=cnt_lst->mb_cont.mc_numowner;
694 }
695 }
696 }
697 }
698
699 return (m);
700 }
701
702 /*
703 * This is the worst-case scenario called only if we're allocating with
704 * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
705 * by looking in every PCPU container. If we're still unsuccessful, we
706 * try the general container one last time and possibly block on our
707 * starved cv.
708 */
709 static void *
710 mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
711 {
712 struct mb_pcpu_list *cnt_lst;
713 struct mb_gen_list *gen_list;
714 struct mb_bucket *bucket;
715 void *m;
716 int i, cv_ret;
717
718 /*
719 * Try to reclaim mbuf-related objects (mbufs, clusters).
720 */
721 mb_reclaim();
722
723 /*
724 * Cycle all the PCPU containers. Increment starved counts if found
725 * empty.
726 */
727 for (i = 0; i < NCPU; i++) {
728 if (CPU_ABSENT(i))
729 continue;
730 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
731 MB_LOCK_CONT(cnt_lst);
732
733 /*
734 * If container is non-empty, get a single object from it.
735 * If empty, increment starved count.
736 */
737 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
738 NULL) {
739 MB_GET_OBJECT(m, bucket, cnt_lst);
740 MB_MBTYPES_INC(cnt_lst, type, 1);
741 MB_UNLOCK_CONT(cnt_lst);
742 mbstat.m_wait++; /* XXX: No consistency. */
743 return (m);
744 } else
745 cnt_lst->mb_cont.mc_starved++;
746
747 MB_UNLOCK_CONT(cnt_lst);
748 }
749
750 /*
751 * We're still here, so that means it's time to get the general
752 * container lock, check it one more time (now that mb_reclaim()
753 * has been called) and if we still get nothing, block on the cv.
754 */
755 gen_list = MB_GET_GEN_LIST(mb_list);
756 MB_LOCK_CONT(gen_list);
757 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
758 MB_GET_OBJECT(m, bucket, gen_list);
759 MB_MBTYPES_INC(gen_list, type, 1);
760 MB_UNLOCK_CONT(gen_list);
761 mbstat.m_wait++; /* XXX: No consistency. */
762 return (m);
763 }
764
765 gen_list->mb_cont.mc_starved++;
766 cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
767 gen_list->mb_cont.mc_lock, mbuf_wait);
768 gen_list->mb_cont.mc_starved--;
769
770 if ((cv_ret == 0) &&
771 ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
772 MB_GET_OBJECT(m, bucket, gen_list);
773 MB_MBTYPES_INC(gen_list, type, 1);
774 mbstat.m_wait++; /* XXX: No consistency. */
775 } else {
776 mbstat.m_drops++; /* XXX: No consistency. */
777 m = NULL;
778 }
779
780 MB_UNLOCK_CONT(gen_list);
781
782 return (m);
783 }
784
785 /*-
786 * Free an object to its rightful container.
787 * In the very general case, this operation is really very easy.
788 * Complications arise primarily if:
789 * (a) We've hit the high limit on number of free objects allowed in
790 * our PCPU container.
791 * (b) We're in a critical situation where our container has been
792 * marked 'starved' and we need to issue wakeups on the starved
793 * condition variable.
794 * (c) Minor (odd) cases: our bucket has migrated while we were
795 * waiting for the lock; our bucket is in the general container;
796 * our bucket is empty.
797 */
798 static __inline
799 void
800 mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist,
801 int *pers_list)
802 {
803 struct mb_pcpu_list *cnt_lst;
804 struct mb_gen_list *gen_list;
805 struct mb_bucket *bucket;
806 u_int owner;
807
808 bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
809
810 /*
811 * Make sure that if after we lock the bucket's present container the
812 * bucket has migrated, that we drop the lock and get the new one.
813 */
814 retry_lock:
815 owner = bucket->mb_owner & ~MB_BUCKET_FREE;
816 switch (owner) {
817 case MB_GENLIST_OWNER:
818 gen_list = MB_GET_GEN_LIST(mb_list);
819 if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
820 if (*pers_list != MB_GENLIST_OWNER) {
821 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
822 *pers_list);
823 MB_UNLOCK_CONT(cnt_lst);
824 MB_LOCK_CONT(gen_list);
825 }
826 } else {
827 MB_LOCK_CONT(gen_list);
828 }
829 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
830 MB_UNLOCK_CONT(gen_list);
831 *pers_list = -1;
832 goto retry_lock;
833 }
834
835 /*
836 * If we're intended for the general container, this is
837 * real easy: no migrating required. The only `bogon'
838 * is that we're now contending with all the threads
839 * dealing with the general list, but this is expected.
840 */
841 MB_PUT_OBJECT(m, bucket, gen_list);
842 MB_MBTYPES_DEC(gen_list, type, 1);
843 if (gen_list->mb_cont.mc_starved > 0)
844 cv_signal(&(gen_list->mgl_mstarved));
845 if ((persist & MBP_PERSIST) == 0)
846 MB_UNLOCK_CONT(gen_list);
847 else
848 *pers_list = MB_GENLIST_OWNER;
849 break;
850
851 default:
852 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
853 if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
854 if (*pers_list == MB_GENLIST_OWNER) {
855 gen_list = MB_GET_GEN_LIST(mb_list);
856 MB_UNLOCK_CONT(gen_list);
857 MB_LOCK_CONT(cnt_lst);
858 } else {
859 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
860 *pers_list);
861 owner = *pers_list;
862 }
863 } else {
864 MB_LOCK_CONT(cnt_lst);
865 }
866 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
867 MB_UNLOCK_CONT(cnt_lst);
868 *pers_list = -1;
869 goto retry_lock;
870 }
871
872 MB_PUT_OBJECT(m, bucket, cnt_lst);
873 MB_MBTYPES_DEC(cnt_lst, type, 1);
874
875 if (cnt_lst->mb_cont.mc_starved > 0) {
876 /*
877 * This is a tough case. It means that we've
878 * been flagged at least once to indicate that
879 * we're empty, and that the system is in a critical
880 * situation, so we ought to migrate at least one
881 * bucket over to the general container.
882 * There may or may not be a thread blocking on
883 * the starved condition variable, but chances
884 * are that one will eventually come up soon so
885 * it's better to migrate now than never.
886 */
887 gen_list = MB_GET_GEN_LIST(mb_list);
888 MB_LOCK_CONT(gen_list);
889 KASSERT((bucket->mb_owner & MB_BUCKET_FREE) != 0,
890 ("mb_free: corrupt bucket %p\n", bucket));
891 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
892 bucket, mb_blist);
893 bucket->mb_owner = MB_GENLIST_OWNER;
894 (*(cnt_lst->mb_cont.mc_objcount))--;
895 (*(gen_list->mb_cont.mc_objcount))++;
896 (*(cnt_lst->mb_cont.mc_numpgs))--;
897 (*(gen_list->mb_cont.mc_numpgs))++;
898
899 /*
900 * Determine whether or not to keep transferring
901 * buckets to the general list or whether we've
902 * transferred enough already.
903 * We realize that although we may flag another
904 * bucket to be migrated to the general container
905 * that in the meantime, the thread that was
906 * blocked on the cv is already woken up and
907 * long gone. But in that case, the worst
908 * consequence is that we will end up migrating
909 * one bucket too many, which is really not a big
910 * deal, especially if we're close to a critical
911 * situation.
912 */
913 if (gen_list->mb_cont.mc_starved > 0) {
914 cnt_lst->mb_cont.mc_starved--;
915 cv_signal(&(gen_list->mgl_mstarved));
916 } else
917 cnt_lst->mb_cont.mc_starved = 0;
918
919 MB_UNLOCK_CONT(gen_list);
920 if ((persist & MBP_PERSIST) == 0)
921 MB_UNLOCK_CONT(cnt_lst);
922 else
923 *pers_list = owner;
924 break;
925 }
926
927 if (*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) {
928 /*
929 * We've hit the high limit of allowed numbers of mbufs
930 * on this PCPU list. We must now migrate a bucket
931 * over to the general container.
932 */
933 gen_list = MB_GET_GEN_LIST(mb_list);
934 MB_LOCK_CONT(gen_list);
935 if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
936 bucket =
937 SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
938 SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
939 mb_blist);
940 }
941 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
942 bucket, mb_blist);
943 bucket->mb_owner = MB_GENLIST_OWNER;
944 *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
945 *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
946 (*(cnt_lst->mb_cont.mc_numpgs))--;
947 (*(gen_list->mb_cont.mc_numpgs))++;
948
949 /*
950 * While we're at it, transfer some of the mbtypes
951 * "count load" onto the general list's mbtypes
952 * array, seeing as how we're moving the bucket
953 * there now, meaning that the freeing of objects
954 * there will now decrement the _general list's_
955 * mbtypes counters, and no longer our PCPU list's
956 * mbtypes counters. We do this for the type presently
957 * being freed in an effort to keep the mbtypes
958 * counters approximately balanced across all lists.
959 */
960 MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE /
961 mb_list->ml_objsize) - bucket->mb_numfree);
962 MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE /
963 mb_list->ml_objsize) - bucket->mb_numfree);
964
965 MB_UNLOCK_CONT(gen_list);
966 if ((persist & MBP_PERSIST) == 0)
967 MB_UNLOCK_CONT(cnt_lst);
968 else
969 *pers_list = owner;
970 break;
971 }
972
973 if (bucket->mb_owner & MB_BUCKET_FREE) {
974 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
975 bucket, mb_blist);
976 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
977 }
978
979 if ((persist & MBP_PERSIST) == 0)
980 MB_UNLOCK_CONT(cnt_lst);
981 else
982 *pers_list = owner;
983 break;
984 }
985 }
986
987 /*
988 * Drain protocols in hopes to free up some resources.
989 *
990 * LOCKING NOTES:
991 * No locks should be held when this is called. The drain routines have to
992 * presently acquire some locks which raises the possibility of lock order
993 * violation if we're holding any mutex if that mutex is acquired in reverse
994 * order relative to one of the locks in the drain routines.
995 */
996 static void
997 mb_reclaim(void)
998 {
999 struct domain *dp;
1000 struct protosw *pr;
1001
1002 /*
1003 * XXX: Argh, we almost always trip here with witness turned on nowadays
1004 * XXX: because we often come in with Giant held. For now, there's no way
1005 * XXX: to avoid this.
1006 */
1007 #ifdef WITNESS
1008 KASSERT(witness_list(curthread) == 0,
1009 ("mb_reclaim() called with locks held"));
1010 #endif
1011
1012 mbstat.m_drain++; /* XXX: No consistency. */
1013
1014 for (dp = domains; dp != NULL; dp = dp->dom_next)
1015 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
1016 if (pr->pr_drain != NULL)
1017 (*pr->pr_drain)();
1018 }
1019
1020 /******************************************************************************
1021 * Internal setup macros.
1022 */
1023
1024 #define _mb_setup(m, type) do { \
1025 (m)->m_type = (type); \
1026 (m)->m_next = NULL; \
1027 (m)->m_nextpkt = NULL; \
1028 (m)->m_data = (m)->m_dat; \
1029 (m)->m_flags = 0; \
1030 } while (0)
1031
1032 #define _mbhdr_setup(m, type) do { \
1033 (m)->m_type = (type); \
1034 (m)->m_next = NULL; \
1035 (m)->m_nextpkt = NULL; \
1036 (m)->m_data = (m)->m_pktdat; \
1037 (m)->m_flags = M_PKTHDR; \
1038 (m)->m_pkthdr.rcvif = NULL; \
1039 (m)->m_pkthdr.csum_flags = 0; \
1040 SLIST_INIT(&(m)->m_pkthdr.tags); \
1041 } while (0)
1042
1043 #define _mcl_setup(m) do { \
1044 (m)->m_data = (m)->m_ext.ext_buf; \
1045 (m)->m_flags |= M_EXT; \
1046 (m)->m_ext.ext_free = NULL; \
1047 (m)->m_ext.ext_args = NULL; \
1048 (m)->m_ext.ext_size = MCLBYTES; \
1049 (m)->m_ext.ext_type = EXT_CLUSTER; \
1050 } while (0)
1051
1052 #define _mext_init_ref(m, ref) do { \
1053 (m)->m_ext.ref_cnt = ((ref) == NULL) ? \
1054 malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \
1055 if ((m)->m_ext.ref_cnt != NULL) { \
1056 *((m)->m_ext.ref_cnt) = 0; \
1057 MEXT_ADD_REF((m)); \
1058 } \
1059 } while (0)
1060
1061 #define cl2ref(cl) \
1062 (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT)
1063
1064 #define _mext_dealloc_ref(m) \
1065 free((m)->m_ext.ref_cnt, M_MBUF)
1066
1067 /******************************************************************************
1068 * Internal routines.
1069 *
1070 * Because mb_alloc() and mb_free() are inlines (to keep the common
1071 * cases down to a maximum of one function call), below are a few
1072 * routines used only internally for the sole purpose of making certain
1073 * functions smaller.
1074 *
1075 * - _mext_free(): frees associated storage when the ref. count is
1076 * exactly one and we're freeing.
1077 *
1078 * - _mgetm_internal(): common "persistent-lock" routine that allocates
1079 * an mbuf and a cluster in one shot, but where the lock is already
1080 * held coming in (which is what makes it different from the exported
1081 * m_getcl()). The lock is dropped when done. This is used by m_getm()
1082 * and, therefore, is very m_getm()-specific.
1083 */
1084 static struct mbuf *_mgetm_internal(int, short, short, int);
1085
1086 void
1087 _mext_free(struct mbuf *mb)
1088 {
1089
1090 if (mb->m_ext.ext_type == EXT_CLUSTER) {
1091 mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
1092 0, NULL);
1093 } else {
1094 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
1095 _mext_dealloc_ref(mb);
1096 }
1097 }
1098
1099 static struct mbuf *
1100 _mgetm_internal(int how, short type, short persist, int cchnum)
1101 {
1102 struct mbuf *mb;
1103
1104 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum);
1105 if (mb == NULL)
1106 return NULL;
1107 _mb_setup(mb, type);
1108
1109 if ((persist & MBP_PERSIST) != 0) {
1110 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
1111 how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
1112 if (mb->m_ext.ext_buf == NULL) {
1113 (void)m_free(mb);
1114 return (NULL);
1115 }
1116 _mcl_setup(mb);
1117 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1118 }
1119 return (mb);
1120 }
1121
1122 /******************************************************************************
1123 * Exported buffer allocation and de-allocation routines.
1124 */
1125
1126 /*
1127 * Allocate and return a single (normal) mbuf. NULL is returned on failure.
1128 *
1129 * Arguments:
1130 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1131 * if really starved for memory. M_DONTWAIT to never block.
1132 * - type: the type of the mbuf being allocated.
1133 */
1134 struct mbuf *
1135 m_get(int how, short type)
1136 {
1137 struct mbuf *mb;
1138
1139 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1140 if (mb != NULL)
1141 _mb_setup(mb, type);
1142 return (mb);
1143 }
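/*
 * Illustrative usage sketch (not part of the original file, compiled out):
 * the common non-blocking allocate/use/free cycle for a single mbuf, as a
 * caller of this allocator would write it.
 */
#if 0
	struct mbuf *m;

	m = m_get(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);	/* Caller chooses how to recover. */
	m->m_len = 0;
	/* ... copy at most MLEN bytes into mtod(m, caddr_t) ... */
	(void)m_free(m);
#endif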
1144
1145 /*
1146 * Allocate a given length worth of mbufs and/or clusters (whatever fits
1147 * best) and return a pointer to the top of the allocated chain. If an
1148 * existing mbuf chain is provided, then we will append the new chain
1149 * to the existing one and return the top of the provided (existing)
1150 * chain. NULL is returned on failure, in which case the [optional]
1151 * provided chain is left untouched, and any memory already allocated
1152 * is freed.
1153 *
1154 * Arguments:
1155 * - m: existing chain to which to append new chain (optional).
1156 * - len: total length of data to append, either in mbufs or clusters
1157 * (we allocate whatever combination yields the best fit).
1158 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1159 * if really starved for memory. M_DONTWAIT to never block.
1160 * - type: the type of the mbuf being allocated.
1161 */
1162 struct mbuf *
1163 m_getm(struct mbuf *m, int len, int how, short type)
1164 {
1165 struct mbuf *mb, *top, *cur, *mtail;
1166 int num, rem, cchnum;
1167 short persist;
1168 int i;
1169
1170 KASSERT(len >= 0, ("m_getm(): len is < 0"));
1171
1172 /* If m != NULL, we will append to the end of that chain. */
1173 if (m != NULL)
1174 for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
1175 else
1176 mtail = NULL;
1177
1178 /*
1179 * In the best-case scenario (which should be the common case
1180 * unless we're in a starvation situation), we will be able to
1181 * go through the allocation of all the desired mbufs and clusters
1182 * here without dropping our per-CPU cache lock in between.
1183 */
1184 num = len / MCLBYTES;
1185 rem = len % MCLBYTES;
1186 persist = 0;
1187 cchnum = -1;
1188 top = cur = NULL;
1189 for (i = 0; i < num; i++) {
1190 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
1191 MBP_PERSIST | persist, &cchnum);
1192 if (mb == NULL)
1193 goto failed;
1194 _mb_setup(mb, type);
1195 mb->m_len = 0;
1196
1197 persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0;
1198 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
1199 how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum);
1200 if (mb->m_ext.ext_buf == NULL) {
1201 (void)m_free(mb);
1202 goto failed;
1203 }
1204 _mcl_setup(mb);
1205 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1206 persist = MBP_PERSISTENT;
1207
1208 if (cur == NULL)
1209 top = cur = mb;
1210 else
1211 cur = (cur->m_next = mb);
1212 }
1213 if (rem > 0) {
1214 if (cchnum >= 0) {
1215 persist = MBP_PERSISTENT;
1216 persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0;
1217 mb = _mgetm_internal(how, type, persist, cchnum);
1218 if (mb == NULL)
1219 goto failed;
1220 } else if (rem > MINCLSIZE) {
1221 mb = m_getcl(how, type, 0);
1222 } else {
1223 mb = m_get(how, type);
1224 }
1225 if (mb != NULL) {
1226 mb->m_len = 0;
1227 if (cur == NULL)
1228 top = mb;
1229 else
1230 cur->m_next = mb;
1231 } else
1232 goto failed;
1233 }
1234
1235 if (mtail != NULL)
1236 mtail->m_next = top;
1237 else
1238 mtail = top;
1239 return mtail;
1240 failed:
1241 if (top != NULL)
1242 m_freem(top);
1243 return NULL;
1244 }
1245
1246 /*
1247 * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure.
1248 *
1249 * Arguments:
1250 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1251 * if really starved for memory. M_DONTWAIT to never block.
1252 * - type: the type of the mbuf being allocated.
1253 */
1254 struct mbuf *
1255 m_gethdr(int how, short type)
1256 {
1257 struct mbuf *mb;
1258
1259 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1260 if (mb != NULL) {
1261 _mbhdr_setup(mb, type);
1262 #ifdef MAC
1263 if (mac_init_mbuf(mb, how) != 0) {
1264 m_free(mb);
1265 return NULL;
1266 }
1267 #endif
1268 }
1269 return (mb);
1270 }
1271
1272 /*
1273 * Allocate and return a single (normal) pre-zero'd mbuf. NULL is
1274 * returned on failure.
1275 *
1276 * Arguments:
1277 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1278 * if really starved for memory. M_DONTWAIT to never block.
1279 * - type: the type of the mbuf being allocated.
1280 */
1281 struct mbuf *
1282 m_get_clrd(int how, short type)
1283 {
1284 struct mbuf *mb;
1285
1286 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1287 if (mb != NULL) {
1288 _mb_setup(mb, type);
1289 bzero(mtod(mb, caddr_t), MLEN);
1290 }
1291 return (mb);
1292 }
1293
1294 /*
1295 * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is
1296 * returned on failure.
1297 *
1298 * Arguments:
1299 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1300 * if really starved for memory. M_DONTWAIT to never block.
1301 * - type: the type of the mbuf being allocated.
1302 */
1303 struct mbuf *
1304 m_gethdr_clrd(int how, short type)
1305 {
1306 struct mbuf *mb;
1307
1308 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1309 if (mb != NULL) {
1310 _mbhdr_setup(mb, type);
1311 #ifdef MAC
1312 if (mac_init_mbuf(mb, how) != 0) {
1313 m_free(mb);
1314 return NULL;
1315 }
1316 #endif
1317 bzero(mtod(mb, caddr_t), MHLEN);
1318 }
1319 return (mb);
1320 }
1321
1322 /*
1323 * Free a single mbuf and any associated storage that it may have attached
1324 * to it. The associated storage may not be immediately freed if its
1325 * reference count is above 1. Returns the next mbuf in the chain following
1326 * the mbuf being freed.
1327 *
1328 * Arguments:
1329 * - mb: the mbuf to free.
1330 */
1331 struct mbuf *
1332 m_free(struct mbuf *mb)
1333 {
1334 struct mbuf *nb;
1335 int cchnum;
1336 short persist = 0;
1337
1338 if ((mb->m_flags & M_PKTHDR) != 0)
1339 m_tag_delete_chain(mb, NULL);
1340 #ifdef MAC
1341 if ((mb->m_flags & M_PKTHDR) &&
1342 (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED))
1343 mac_destroy_mbuf(mb);
1344 #endif
1345 nb = mb->m_next;
1346 if ((mb->m_flags & M_EXT) != 0) {
1347 MEXT_REM_REF(mb);
1348 if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) {
1349 if (mb->m_ext.ext_type == EXT_CLUSTER) {
1350 mb_free(&mb_list_clust,
1351 (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
1352 MBP_PERSIST, &cchnum);
1353 persist = MBP_PERSISTENT;
1354 } else {
1355 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf,
1356 mb->m_ext.ext_args);
1357 _mext_dealloc_ref(mb);
1358 persist = 0;
1359 }
1360 }
1361 }
1362 mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum);
1363 return (nb);
1364 }
1365
1366 /*
1367 * Free an entire chain of mbufs and associated external buffers, if
1368 * applicable. Right now, we only optimize a little so that the cache
1369 * lock may be held across a single mbuf+cluster free. Hopefully,
1370 * we'll eventually be holding the lock across more than merely two
1371 * consecutive frees but right now this is hard to implement because of
1372 * things like _mext_dealloc_ref (may do a free()) and atomic ops in the
1373 * loop.
1374 *
1375 * - mb: the mbuf chain to free.
1376 */
1377 void
1378 m_freem(struct mbuf *mb)
1379 {
1380 struct mbuf *m;
1381 int cchnum;
1382 short persist;
1383
1384 while (mb != NULL) {
1385 if ((mb->m_flags & M_PKTHDR) != 0)
1386 m_tag_delete_chain(mb, NULL);
1387 #ifdef MAC
1388 if ((mb->m_flags & M_PKTHDR) &&
1389 (mb->m_pkthdr.label.l_flags & MAC_FLAG_INITIALIZED))
1390 mac_destroy_mbuf(mb);
1391 #endif
1392 persist = 0;
1393 m = mb;
1394 mb = mb->m_next;
1395 if ((m->m_flags & M_EXT) != 0) {
1396 MEXT_REM_REF(m);
1397 if (atomic_cmpset_int(m->m_ext.ref_cnt, 0, 1)) {
1398 if (m->m_ext.ext_type == EXT_CLUSTER) {
1399 mb_free(&mb_list_clust,
1400 (caddr_t)m->m_ext.ext_buf,
1401 MT_NOTMBUF, MBP_PERSIST, &cchnum);
1402 persist = MBP_PERSISTENT;
1403 } else {
1404 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
1405 m->m_ext.ext_args);
1406 _mext_dealloc_ref(m);
1407 persist = 0;
1408 }
1409 }
1410 }
1411 mb_free(&mb_list_mbuf, m, m->m_type, persist, &cchnum);
1412 }
1413 }
1414
1415 /*
1416 * Fetch an mbuf with a cluster attached to it. If one of the
1417 * allocations fails, the entire allocation fails. This routine is
1418 * the preferred way of fetching both the mbuf and cluster together,
1419 * as it avoids having to unlock/relock between allocations. Returns
1420 * NULL on failure.
1421 *
1422 * Arguments:
1423 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1424 * if really starved for memory. M_DONTWAIT to never block.
1425 * - type: the type of the mbuf being allocated.
1426 * - flags: any flags to pass to the mbuf being allocated; if this includes
1427 * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf.
1428 */
1429 struct mbuf *
1430 m_getcl(int how, short type, int flags)
1431 {
1432 struct mbuf *mb;
1433 int cchnum;
1434
1435 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
1436 MBP_PERSIST, &cchnum);
1437 if (mb == NULL)
1438 return NULL;
1439 mb->m_type = type;
1440 mb->m_next = NULL;
1441 mb->m_flags = flags;
1442 if ((flags & M_PKTHDR) != 0) {
1443 mb->m_nextpkt = NULL;
1444 mb->m_pkthdr.rcvif = NULL;
1445 mb->m_pkthdr.csum_flags = 0;
1446 SLIST_INIT(&mb->m_pkthdr.tags);
1447 }
1448
1449 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
1450 MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
1451 if (mb->m_ext.ext_buf == NULL) {
1452 (void)m_free(mb);
1453 mb = NULL;
1454 } else {
1455 _mcl_setup(mb);
1456 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1457 }
1458 #ifdef MAC
1459 if (mb != NULL && (flags & M_PKTHDR) && (mac_init_mbuf(mb, how) != 0)) {
1460 m_free(mb);
1461 return NULL;
1462 }
1463 #endif
1464 return (mb);
1465 }
1466
1467 /*
1468 * Fetch a single mbuf cluster and attach it to an existing mbuf. If
1469 * successful, configures the provided mbuf to have mbuf->m_ext.ext_buf
1470 * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags.
1471 * The M_EXT bit is not set on failure.
1472 *
1473 * Arguments:
1474 * - mb: the existing mbuf to which to attach the allocated cluster.
1475 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1476 * if really starved for memory. M_DONTWAIT to never block.
1477 */
1478 void
1479 m_clget(struct mbuf *mb, int how)
1480 {
1481
1482 mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF,
1483 0, NULL);
1484 if (mb->m_ext.ext_buf != NULL) {
1485 _mcl_setup(mb);
1486 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1487 }
1488 }
1489
1490 /*
1491 * Configure a provided mbuf to refer to the provided external storage
1492 * buffer and setup a reference count for said buffer. If the setting
1493 * up of the reference count fails, the M_EXT bit will not be set. If
1494 * successful, the M_EXT bit is set in the mbuf's flags.
1495 *
1496 * Arguments:
1497 * - mb: the existing mbuf to which to attach the provided buffer.
1498 * - buf: the address of the provided external storage buffer.
1499 * - size: the size of the provided buffer.
1500 * - freef: a pointer to a routine that is responsible for freeing the
1501 * provided external storage buffer.
1502 * - args: a pointer to an argument structure (of any type) to be passed
1503 * to the provided freef routine (may be NULL).
1504 * - flags: any other flags to be passed to the provided mbuf.
1505 * - type: the type that the external storage buffer should be labeled with.
1506 */
1507 void
1508 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
1509 void (*freef)(void *, void *), void *args, int flags, int type)
1510 {
1511
1512 _mext_init_ref(mb, ((type != EXT_CLUSTER) ?
1513 NULL : &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]));
1514 if (mb->m_ext.ref_cnt != NULL) {
1515 mb->m_flags |= (M_EXT | flags);
1516 mb->m_ext.ext_buf = buf;
1517 mb->m_data = mb->m_ext.ext_buf;
1518 mb->m_ext.ext_size = size;
1519 mb->m_ext.ext_free = freef;
1520 mb->m_ext.ext_args = args;
1521 mb->m_ext.ext_type = type;
1522 }
1523 }
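/*
 * Illustrative usage sketch (not part of the original file, compiled out):
 * attaching driver-owned external storage to an mbuf with m_extadd().  The
 * buffer, its size, the free routine and the EXT_NET_DRV type value are all
 * assumptions made for the example; a real driver supplies its own.
 */
#if 0
static void
example_ext_free(void *buf, void *args)
{

	/* Return the storage to whatever pool the driver took it from. */
	free(buf, M_DEVBUF);
}

static void
example_attach(struct mbuf *m, caddr_t drv_buf, u_int drv_size)
{

	m_extadd(m, drv_buf, drv_size, example_ext_free, NULL, 0, EXT_NET_DRV);
	if ((m->m_flags & M_EXT) == 0) {
		/* Reference count setup failed; the buffer was not attached. */
	}
}
#endif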
1524
1525 /*
1526 * Change type of provided mbuf. This is a relatively expensive operation
1527 * (due to the cost of statistics manipulations) and should be avoided, where
1528 * possible.
1529 *
1530 * Arguments:
1531 * - mb: the provided mbuf for which the type needs to be changed.
1532 * - new_type: the new type to change the mbuf to.
1533 */
1534 void
1535 m_chtype(struct mbuf *mb, short new_type)
1536 {
1537 struct mb_gen_list *gen_list;
1538
1539 gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
1540 MB_LOCK_CONT(gen_list);
1541 MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
1542 MB_MBTYPES_INC(gen_list, new_type, 1);
1543 MB_UNLOCK_CONT(gen_list);
1544 mb->m_type = new_type;
1545 }