FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_mbuf.c
1 /*-
2 * Copyright (c) 2001, 2002, 2003
3 * Bosko Milekic <bmilekic@FreeBSD.org>. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 * derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD: releng/5.2/sys/kern/subr_mbuf.c 121307 2003-10-21 18:28:36Z silby $");
31
32 #include "opt_mac.h"
33 #include "opt_param.h"
34
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/malloc.h>
38 #include <sys/mac.h>
39 #include <sys/mbuf.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/condvar.h>
43 #include <sys/smp.h>
44 #include <sys/kernel.h>
45 #include <sys/sysctl.h>
46 #include <sys/domain.h>
47 #include <sys/protosw.h>
48
49 #include <vm/vm.h>
50 #include <vm/vm_kern.h>
51 #include <vm/vm_extern.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_map.h>
54
55 /*
56 * mb_alloc: network buffer allocator
57 *
58 * XXX: currently, the "low watermark" sysctl is marked read-only as its
59 * effects are not completely implemented. To be fixed soon.
60 */
61
62 /*
63 * Maximum number of PCPU containers. If you know what you're doing you could
64 * explicitly define MBALLOC_NCPU to be exactly the number of CPUs on your
65 * system during compilation, and thus prevent kernel structure bloat.
66 *
67 * SMP and non-SMP kernels clearly have a different number of possible CPUs,
68 * but because we cannot assume a dense array of CPUs, we always allocate
69 * and traverse PCPU containers up to NCPU amount and merely check for
70 * CPU availability.
71 */
72 #ifdef MBALLOC_NCPU
73 #define NCPU MBALLOC_NCPU
74 #else
75 #define NCPU MAXCPU
76 #endif
77
78 /*-
79 * The mbuf allocator is based on Alfred Perlstein's <alfred@FreeBSD.org>
80 * "memcache" proof-of-concept allocator which was itself based on
81 * several well-known SMP-friendly allocators.
82 *
83 * The mb_alloc mbuf allocator is special when compared to other
84 * general-purpose allocators. Some things to take note of:
85 *
86 * Mbufs and mbuf clusters are two different objects. Sometimes we
87 * will allocate a single mbuf, other times a single cluster,
88 * other times both. Further, we may sometimes wish to allocate a
89 * whole chain of mbufs with clusters. This allocator will perform
90 * the common case of each scenario in one function call (this
91 * includes constructing or destructing the object) while only
92 * locking/unlocking the cache once, if it can get away with it.
93 * The caches consist of pure mbufs and pure clusters; that is
94 * there are no 'zones' containing mbufs with already pre-hooked
95 * clusters. Since we can allocate both objects atomically anyway,
96 * we don't bother fragmenting our caches for any particular 'scenarios.'
97 *
98 * We allocate from separate sub-maps of kmem_map, thus imposing
99 * an ultimate upper-limit on the number of allocatable clusters
100 * and mbufs and also, since the clusters all come from a
101 * virtually contiguous region, we can keep reference counters
102 * for them and "allocate" them purely by indexing into a
103 * dense refcount vector.
104 *
105 * We call out to protocol drain routines (which can be hooked
106 * into us) when we're low on space.
107 *
108 * The mbuf allocator keeps all objects that it allocates in mb_buckets.
109 * The buckets keep a number of objects (an object can be an mbuf or an
110 * mbuf cluster) and facilitate moving larger sets of contiguous objects
111 * from the per-CPU caches to the global cache. The buckets also have
112 * the added advantage that objects, when migrated from cache to cache,
113 * are migrated in chunks that keep contiguous objects together,
114 * minimizing TLB pollution.
115 *
116 * The buckets are kept on singly-linked lists called "containers." A container
117 * is protected by a mutex in order to ensure consistency. The mutex
118 * itself is allocated separately and attached to the container at boot time,
119 * thus allowing for certain containers to share the same lock. Per-CPU
120 * containers for mbufs and mbuf clusters all share the same per-CPU
121 * lock whereas the global cache containers for these objects share one
122 * global lock.
123 */
124 struct mb_bucket {
125 SLIST_ENTRY(mb_bucket) mb_blist;
126 int mb_owner;
127 int mb_numfree;
128 void *mb_free[0];
129 };
130
131 struct mb_container {
132 SLIST_HEAD(mc_buckethd, mb_bucket) mc_bhead;
133 struct mtx *mc_lock;
134 int mc_numowner;
135 u_int mc_starved;
136 long *mc_types;
137 u_long *mc_objcount;
138 u_long *mc_numbucks;
139 };
140
141 struct mb_gen_list {
142 struct mb_container mb_cont;
143 struct cv mgl_mstarved;
144 };
145
146 struct mb_pcpu_list {
147 struct mb_container mb_cont;
148 };
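/*
 * Example (minimal sketch, kept out of the build with #if 0): how the free
 * objects cached in one container could be counted by walking its bucket
 * list, illustrating the container/bucket relationship described above.
 * The allocator itself tracks this number in mc_objcount; the function name
 * is hypothetical and the caller is assumed to hold the container's mc_lock.
 */
#if 0
static u_long
mb_example_count_free(struct mb_container *cnt)
{
	struct mb_bucket *bucket;
	u_long total;

	total = 0;
	SLIST_FOREACH(bucket, &cnt->mc_bhead, mb_blist)
		total += bucket->mb_numfree;
	return (total);
}
#endif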
149
150 /*
151 * Boot-time configurable object counts that will determine the maximum
152 * number of permitted objects in the mbuf and mcluster cases. In the
153 * ext counter (nmbcnt) case, it's just an indicator serving to scale
154 * kmem_map size properly - in other words, we may be allowed to allocate
155 * more than nmbcnt counters, whereas we will never be allowed to allocate
156 * more than nmbufs mbufs or nmbclusters mclusters.
157 * As for nsfbufs, it is used to indicate how many sendfile(2) buffers will be
158 * allocatable by the sfbuf allocator (found in uipc_syscalls.c).
159 */
160 #ifndef NMBCLUSTERS
161 #define NMBCLUSTERS (1024 + maxusers * 64)
162 #endif
163 #ifndef NMBUFS
164 #define NMBUFS (nmbclusters * 2)
165 #endif
166 #ifndef NSFBUFS
167 #define NSFBUFS (512 + maxusers * 16)
168 #endif
169 #ifndef NMBCNTS
170 #define NMBCNTS (nmbclusters + nsfbufs)
171 #endif
172 int nmbufs;
173 int nmbclusters;
174 int nmbcnt;
175 int nsfbufs;
176
177 /*
178 * Bucket sizes, in bytes. Each bucket holds this many bytes' worth of
179 * mbufs or clusters. Please keep these a power of 2.
180 */
181 #define MBUF_BUCK_SZ (PAGE_SIZE * 2)
182 #define CLUST_BUCK_SZ (PAGE_SIZE * 4)
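/*
 * For example, with the common i386 values (PAGE_SIZE 4096, MSIZE 256,
 * MCLBYTES 2048), an mbuf bucket is 8192 bytes and holds 8192 / 256 = 32
 * mbufs, while a cluster bucket is 16384 bytes and holds 16384 / 2048 = 8
 * clusters.
 */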
183
184 /*
185 * Perform sanity checks of tunables declared above.
186 */
187 static void
188 tunable_mbinit(void *dummy)
189 {
190
191 /*
192 * This has to be done before VM init.
193 */
194 nmbclusters = NMBCLUSTERS;
195 TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
196 nmbufs = NMBUFS;
197 TUNABLE_INT_FETCH("kern.ipc.nmbufs", &nmbufs);
198 nsfbufs = NSFBUFS;
199 TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
200 nmbcnt = NMBCNTS;
201 TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
202 /* Sanity checks */
203 if (nmbufs < nmbclusters * 2)
204 nmbufs = nmbclusters * 2;
205 if (nmbcnt < nmbclusters + nsfbufs)
206 nmbcnt = nmbclusters + nsfbufs;
207 }
208 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
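/*
 * All four values are boot-time tunables; for example, they may be raised
 * from /boot/loader.conf before the kernel starts:
 *
 *	kern.ipc.nmbclusters="32768"
 *	kern.ipc.nmbufs="65536"
 *
 * (The values above are only an illustration.)
 */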
209
210 /*
211 * The freelist structures and mutex locks. The number statically declared
212 * here depends on the number of CPUs.
213 *
214 * We set up in such a way that all the objects (mbufs, clusters)
215 * share the same mutex lock. It has been established that we do not benefit
216 * from different locks for different objects, so we use the same lock,
217 * regardless of object type. This also allows us to do optimised
218 * multi-object allocations without dropping the lock in between.
219 */
220 struct mb_lstmngr {
221 struct mb_gen_list *ml_genlist;
222 struct mb_pcpu_list *ml_cntlst[NCPU];
223 struct mb_bucket **ml_btable;
224 vm_map_t ml_map;
225 vm_offset_t ml_mapbase;
226 vm_offset_t ml_maptop;
227 int ml_mapfull;
228 u_int ml_objsize;
229 u_int ml_objbucks;
230 u_int *ml_wmhigh;
231 u_int *ml_wmlow;
232 };
233 static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
234 static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
235 static u_int *cl_refcntmap;
236
237 /*
238 * Local macros for internal allocator structure manipulations.
239 */
240 #ifdef SMP
241 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[PCPU_GET(cpuid)]
242 #else
243 #define MB_GET_PCPU_LIST(mb_lst) (mb_lst)->ml_cntlst[0]
244 #endif
245
246 #define MB_GET_GEN_LIST(mb_lst) (mb_lst)->ml_genlist
247
248 #define MB_LOCK_CONT(mb_cnt) mtx_lock((mb_cnt)->mb_cont.mc_lock)
249
250 #define MB_UNLOCK_CONT(mb_cnt) mtx_unlock((mb_cnt)->mb_cont.mc_lock)
251
252 #define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
253 (mb_lst)->ml_cntlst[(num)]
254
255 #define MB_BUCKET_INDX(mb_obj, mb_lst) \
256 (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / \
257 ((mb_lst)->ml_objbucks * (mb_lst)->ml_objsize))
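/*
 * For example, an mbuf that lies 20480 bytes past ml_mapbase falls in bucket
 * 20480 / MBUF_BUCK_SZ = 20480 / 8192 = 2 of ml_btable (assuming the i386
 * value of MBUF_BUCK_SZ worked out above).
 */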
258
259 #define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
260 { \
261 struct mc_buckethd *_mchd = &((mb_lst)->mb_cont.mc_bhead); \
262 \
263 (mb_bckt)->mb_numfree--; \
264 (mb_objp) = (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)]; \
265 (*((mb_lst)->mb_cont.mc_objcount))--; \
266 if ((mb_bckt)->mb_numfree == 0) { \
267 SLIST_REMOVE_HEAD(_mchd, mb_blist); \
268 SLIST_NEXT((mb_bckt), mb_blist) = NULL; \
269 (mb_bckt)->mb_owner |= MB_BUCKET_FREE; \
270 } \
271 }
272
273 #define MB_PUT_OBJECT(mb_objp, mb_bckt, mb_lst) \
274 (mb_bckt)->mb_free[((mb_bckt)->mb_numfree)] = (mb_objp); \
275 (mb_bckt)->mb_numfree++; \
276 (*((mb_lst)->mb_cont.mc_objcount))++;
277
278 #define MB_MBTYPES_INC(mb_cnt, mb_type, mb_num) \
279 if ((mb_type) != MT_NOTMBUF) \
280 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) += (mb_num)
281
282 #define MB_MBTYPES_DEC(mb_cnt, mb_type, mb_num) \
283 if ((mb_type) != MT_NOTMBUF) \
284 (*((mb_cnt)->mb_cont.mc_types + (mb_type))) -= (mb_num)
285
286 /*
287 * Ownership of buckets/containers is represented by integers. The PCPU
288 * lists range from 0 to NCPU-1. We need a free numerical id for the general
289 * list (we use NCPU). We also need a non-conflicting free bit to indicate
290 * that the bucket is free and removed from a container, while not losing
291 * the bucket's originating container id. We use the highest bit
292 * for the free marker.
293 */
294 #define MB_GENLIST_OWNER (NCPU)
295 #define MB_BUCKET_FREE (1 << (sizeof(int) * 8 - 1))
296
297 /* Statistics structures for allocator (per-CPU and general). */
298 static struct mbpstat mb_statpcpu[NCPU + 1];
299 struct mbstat mbstat;
300
301 /* Sleep time for wait code (in ticks). */
302 static int mbuf_wait = 64;
303
304 static u_int mbuf_hiwm = 512; /* High wm on # of mbufs per cache */
305 static u_int mbuf_lowm = 128; /* Low wm on # of mbufs per cache */
306 static u_int clust_hiwm = 128; /* High wm on # of clusters per cache */
307 static u_int clust_lowm = 16; /* Low wm on # of clusters per cache */
308
309 /*
310 * Objects exported by sysctl(8).
311 */
312 SYSCTL_DECL(_kern_ipc);
313 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RDTUN, &nmbclusters, 0,
314 "Maximum number of mbuf clusters available");
315 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbufs, CTLFLAG_RDTUN, &nmbufs, 0,
316 "Maximum number of mbufs available");
317 SYSCTL_INT(_kern_ipc, OID_AUTO, nmbcnt, CTLFLAG_RDTUN, &nmbcnt, 0,
318 "Number used to scale kmem_map to ensure sufficient space for counters");
319 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
320 "Maximum number of sendfile(2) sf_bufs available");
321 SYSCTL_INT(_kern_ipc, OID_AUTO, mbuf_wait, CTLFLAG_RW, &mbuf_wait, 0,
322 "Sleep time of mbuf subsystem wait allocations during exhaustion");
323 SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_hiwm, CTLFLAG_RW, &mbuf_hiwm, 0,
324 "Upper limit of number of mbufs allowed in each cache");
325 SYSCTL_UINT(_kern_ipc, OID_AUTO, mbuf_lowm, CTLFLAG_RD, &mbuf_lowm, 0,
326 "Lower limit of number of mbufs allowed in each cache");
327 SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_hiwm, CTLFLAG_RW, &clust_hiwm, 0,
328 "Upper limit of number of mbuf clusters allowed in each cache");
329 SYSCTL_UINT(_kern_ipc, OID_AUTO, clust_lowm, CTLFLAG_RD, &clust_lowm, 0,
330 "Lower limit of number of mbuf clusters allowed in each cache");
331 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
332 "Mbuf general information and statistics");
333 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
334 sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
335
336 /*
337 * Prototypes of local allocator routines.
338 */
339 static void *mb_alloc_wait(struct mb_lstmngr *, short);
340 static struct mb_bucket *mb_pop_cont(struct mb_lstmngr *, int,
341 struct mb_pcpu_list *);
342 static void mb_reclaim(void);
343 static void mbuf_init(void *);
344
345 /*
346 * Initial allocation numbers. Each parameter represents the number of buckets
347 * of each object that will be placed initially in each PCPU container for
348 * said object.
349 */
350 #define NMB_MBUF_INIT 2
351 #define NMB_CLUST_INIT 8
352
353 /*
354 * Internal flags that allow for cache locks to remain "persistent" across
355 * allocation and free calls. They may be used in combination.
356 */
357 #define MBP_PERSIST 0x1 /* Return with lock still held. */
358 #define MBP_PERSISTENT 0x2 /* Cache lock is already held coming in. */
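/*
 * Example (minimal sketch, kept out of the build with #if 0): the
 * persistent-lock protocol as it is used by m_getcl() below. The first
 * allocation returns with the chosen per-CPU cache locked and its number in
 * cchnum; the second call finds that lock already held and, because
 * MBP_PERSIST is not passed again, drops it before returning. The function
 * name is hypothetical.
 */
#if 0
static void
mb_example_persist(void)
{
	void *mb, *cl;
	int cchnum;

	mb = mb_alloc(&mb_list_mbuf, M_DONTWAIT, MT_DATA, MBP_PERSIST,
	    &cchnum);
	if (mb != NULL) {
		cl = mb_alloc(&mb_list_clust, M_DONTWAIT, MT_NOTMBUF,
		    MBP_PERSISTENT, &cchnum);
		/* ... cl now refers to a cluster from the same cache ... */
	}
}
#endif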
359
360 /*
361 * Initialize the mbuf subsystem.
362 *
363 * We sub-divide the kmem_map into several submaps; this way, we don't have
364 * to worry about artificially limiting the number of mbuf or mbuf cluster
365 * allocations, due to fear of one type of allocation "stealing" address
366 * space initially reserved for another.
367 *
368 * Set up both the general containers and all the PCPU containers. Populate
369 * the PCPU containers with initial numbers.
370 */
371 MALLOC_DEFINE(M_MBUF, "mbufmgr", "mbuf subsystem management structures");
372 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
373 static void
374 mbuf_init(void *dummy)
375 {
376 struct mb_pcpu_list *pcpu_cnt;
377 vm_size_t mb_map_size;
378 int i, j;
379
380 /*
381 * Set up all the submaps, for each type of object that we deal
382 * with in this allocator.
383 */
384 mb_map_size = (vm_size_t)(nmbufs * MSIZE);
385 mb_map_size = rounddown(mb_map_size, MBUF_BUCK_SZ);
386 mb_list_mbuf.ml_btable = malloc((unsigned long)mb_map_size /
387 MBUF_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
388 if (mb_list_mbuf.ml_btable == NULL)
389 goto bad;
390 mb_list_mbuf.ml_map = kmem_suballoc(kmem_map,&(mb_list_mbuf.ml_mapbase),
391 &(mb_list_mbuf.ml_maptop), mb_map_size);
392 mb_list_mbuf.ml_map->system_map = 1;
393 mb_list_mbuf.ml_mapfull = 0;
394 mb_list_mbuf.ml_objsize = MSIZE;
395 mb_list_mbuf.ml_objbucks = MBUF_BUCK_SZ / mb_list_mbuf.ml_objsize;
396 mb_list_mbuf.ml_wmhigh = &mbuf_hiwm;
397 mb_list_mbuf.ml_wmlow = &mbuf_lowm;
398
399 mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
400 mb_map_size = rounddown(mb_map_size, CLUST_BUCK_SZ);
401 mb_list_clust.ml_btable = malloc((unsigned long)mb_map_size /
402 CLUST_BUCK_SZ * sizeof(struct mb_bucket *), M_MBUF, M_NOWAIT);
403 if (mb_list_clust.ml_btable == NULL)
404 goto bad;
405 mb_list_clust.ml_map = kmem_suballoc(kmem_map,
406 &(mb_list_clust.ml_mapbase), &(mb_list_clust.ml_maptop),
407 mb_map_size);
408 mb_list_clust.ml_map->system_map = 1;
409 mb_list_clust.ml_mapfull = 0;
410 mb_list_clust.ml_objsize = MCLBYTES;
411 mb_list_clust.ml_objbucks = CLUST_BUCK_SZ / mb_list_clust.ml_objsize;
412 mb_list_clust.ml_wmhigh = &clust_hiwm;
413 mb_list_clust.ml_wmlow = &clust_lowm;
414
415 /*
416 * Allocate required general (global) containers for each object type.
417 */
418 mb_list_mbuf.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
419 M_NOWAIT);
420 mb_list_clust.ml_genlist = malloc(sizeof(struct mb_gen_list), M_MBUF,
421 M_NOWAIT);
422 if ((mb_list_mbuf.ml_genlist == NULL) ||
423 (mb_list_clust.ml_genlist == NULL))
424 goto bad;
425
426 /*
427 * Initialize condition variables and general container mutex locks.
428 */
429 mtx_init(&mbuf_gen, "mbuf subsystem general lists lock", NULL, 0);
430 cv_init(&(mb_list_mbuf.ml_genlist->mgl_mstarved), "mbuf pool starved");
431 cv_init(&(mb_list_clust.ml_genlist->mgl_mstarved),
432 "mcluster pool starved");
433 mb_list_mbuf.ml_genlist->mb_cont.mc_lock =
434 mb_list_clust.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
435
436 /*
437 * Set up the general containers for each object.
438 */
439 mb_list_mbuf.ml_genlist->mb_cont.mc_numowner =
440 mb_list_clust.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
441 mb_list_mbuf.ml_genlist->mb_cont.mc_starved =
442 mb_list_clust.ml_genlist->mb_cont.mc_starved = 0;
443 mb_list_mbuf.ml_genlist->mb_cont.mc_objcount =
444 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbfree);
445 mb_list_clust.ml_genlist->mb_cont.mc_objcount =
446 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clfree);
447 mb_list_mbuf.ml_genlist->mb_cont.mc_numbucks =
448 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbbucks);
449 mb_list_clust.ml_genlist->mb_cont.mc_numbucks =
450 &(mb_statpcpu[MB_GENLIST_OWNER].mb_clbucks);
451 mb_list_mbuf.ml_genlist->mb_cont.mc_types =
452 &(mb_statpcpu[MB_GENLIST_OWNER].mb_mbtypes[0]);
453 mb_list_clust.ml_genlist->mb_cont.mc_types = NULL;
454 SLIST_INIT(&(mb_list_mbuf.ml_genlist->mb_cont.mc_bhead));
455 SLIST_INIT(&(mb_list_clust.ml_genlist->mb_cont.mc_bhead));
456
457 /*
458 * Allocate all the required counters for clusters. This makes
459 * cluster allocations/deallocations much faster.
460 */
461 cl_refcntmap = malloc(nmbclusters * sizeof(u_int), M_MBUF, M_NOWAIT);
462 if (cl_refcntmap == NULL)
463 goto bad;
464
465 /*
466 * Initialize general mbuf statistics.
467 */
468 mbstat.m_msize = mb_list_mbuf.ml_objsize;
469 mbstat.m_mclbytes = mb_list_clust.ml_objsize;
470 mbstat.m_minclsize = MINCLSIZE;
471 mbstat.m_mlen = MLEN;
472 mbstat.m_mhlen = MHLEN;
473 mbstat.m_numtypes = MT_NTYPES;
474 mbstat.m_mbperbuck = mb_list_mbuf.ml_objbucks;
475 mbstat.m_clperbuck = mb_list_clust.ml_objbucks;
476
477 /*
478 * Allocate and initialize PCPU containers.
479 */
480 for (i = 0; i < NCPU; i++) {
481 if (CPU_ABSENT(i)) {
482 mb_statpcpu[i].mb_active = 0;
483 continue;
484 }
485
486 mb_list_mbuf.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
487 M_MBUF, M_NOWAIT);
488 mb_list_clust.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
489 M_MBUF, M_NOWAIT);
490 if ((mb_list_mbuf.ml_cntlst[i] == NULL) ||
491 (mb_list_clust.ml_cntlst[i] == NULL))
492 goto bad;
493
494 mtx_init(&mbuf_pcpu[i], "mbuf PCPU list lock", NULL, 0);
495 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_lock =
496 mb_list_clust.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
497
498 mb_statpcpu[i].mb_active = 1;
499 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numowner =
500 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numowner = i;
501 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_starved =
502 mb_list_clust.ml_cntlst[i]->mb_cont.mc_starved = 0;
503 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_objcount =
504 &(mb_statpcpu[i].mb_mbfree);
505 mb_list_clust.ml_cntlst[i]->mb_cont.mc_objcount =
506 &(mb_statpcpu[i].mb_clfree);
507 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_numbucks =
508 &(mb_statpcpu[i].mb_mbbucks);
509 mb_list_clust.ml_cntlst[i]->mb_cont.mc_numbucks =
510 &(mb_statpcpu[i].mb_clbucks);
511 mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_types =
512 &(mb_statpcpu[i].mb_mbtypes[0]);
513 mb_list_clust.ml_cntlst[i]->mb_cont.mc_types = NULL;
514
515 SLIST_INIT(&(mb_list_mbuf.ml_cntlst[i]->mb_cont.mc_bhead));
516 SLIST_INIT(&(mb_list_clust.ml_cntlst[i]->mb_cont.mc_bhead));
517
518 /*
519 * Perform initial allocations.
520 */
521 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_mbuf, i);
522 MB_LOCK_CONT(pcpu_cnt);
523 for (j = 0; j < NMB_MBUF_INIT; j++) {
524 if (mb_pop_cont(&mb_list_mbuf, M_DONTWAIT, pcpu_cnt)
525 == NULL)
526 goto bad;
527 }
528 MB_UNLOCK_CONT(pcpu_cnt);
529
530 pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_clust, i);
531 MB_LOCK_CONT(pcpu_cnt);
532 for (j = 0; j < NMB_CLUST_INIT; j++) {
533 if (mb_pop_cont(&mb_list_clust, M_DONTWAIT, pcpu_cnt)
534 == NULL)
535 goto bad;
536 }
537 MB_UNLOCK_CONT(pcpu_cnt);
538 }
539
540 return;
541 bad:
542 panic("mbuf_init(): failed to initialize mbuf subsystem!");
543 }
544
545 /*
546 * Populate a given mbuf PCPU container with a bucket full of fresh new
547 * buffers. Return a pointer to the new bucket (already in the container if
548 * successful), or return NULL on failure.
549 *
550 * LOCKING NOTES:
551 * PCPU container lock must be held when this is called.
552 * The lock is dropped here so that we can cleanly call the underlying VM
553 * code. If we fail, we return with no locks held. If we succeed (i.e., return
554 * non-NULL), we return with the PCPU lock held, ready for allocation from
555 * the returned bucket.
556 */
557 static struct mb_bucket *
558 mb_pop_cont(struct mb_lstmngr *mb_list, int how, struct mb_pcpu_list *cnt_lst)
559 {
560 struct mb_bucket *bucket;
561 caddr_t p;
562 int i;
563
564 MB_UNLOCK_CONT(cnt_lst);
565 /*
566 * If our object's (finite) map is starved now (i.e., no more address
567 * space), bail out now.
568 */
569 if (mb_list->ml_mapfull)
570 return (NULL);
571
572 bucket = malloc(sizeof(struct mb_bucket) +
573 mb_list->ml_objbucks * sizeof(void *), M_MBUF, MBTOM(how));
574 if (bucket == NULL)
575 return (NULL);
576
577 p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_objsize *
578 mb_list->ml_objbucks, MBTOM(how));
579 if (p == NULL) {
580 free(bucket, M_MBUF);
581 if (how == M_TRYWAIT)
582 mb_list->ml_mapfull = 1;
583 return (NULL);
584 }
585
586 bucket->mb_numfree = 0;
587 mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
588 for (i = 0; i < mb_list->ml_objbucks; i++) {
589 bucket->mb_free[i] = p;
590 bucket->mb_numfree++;
591 p += mb_list->ml_objsize;
592 }
593
594 MB_LOCK_CONT(cnt_lst);
595 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
596 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead), bucket, mb_blist);
597 (*(cnt_lst->mb_cont.mc_numbucks))++;
598 *(cnt_lst->mb_cont.mc_objcount) += bucket->mb_numfree;
599
600 return (bucket);
601 }
602
603 /*
604 * Allocate a network buffer.
605 * The general case is very easy. Complications only arise if our PCPU
606 * container is empty. Things get worse if the PCPU container is empty,
607 * the general container is empty, and we've run out of address space
608 * in our map; then we try to block if we're willing to (M_TRYWAIT).
609 */
610 static
611 void *
612 mb_alloc(struct mb_lstmngr *mb_list, int how, short type, short persist,
613 int *pers_list)
614 {
615 static int last_report;
616 struct mb_pcpu_list *cnt_lst;
617 struct mb_bucket *bucket;
618 void *m;
619
620 #ifdef INVARIANTS
621 int flags;
622
623 flags = how & (M_WAITOK | M_NOWAIT | M_DONTWAIT | M_TRYWAIT);
624 if (flags != M_DONTWAIT && flags != M_TRYWAIT) {
625 static struct timeval lasterr;
626 static int curerr;
627 if (ppsratecheck(&lasterr, &curerr, 1)) {
628 printf("Bad mbuf alloc flags: %x\n", flags);
629 backtrace();
630 how = M_TRYWAIT;
631 }
632 }
633 #endif
634
635 m = NULL;
636 if ((persist & MBP_PERSISTENT) != 0) {
637 /*
638 * If we're a "persistent" call, then the per-CPU #(pers_list)
639 * cache lock is already held, and we just need to refer to
640 * the correct cache descriptor.
641 */
642 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, *pers_list);
643 } else {
644 cnt_lst = MB_GET_PCPU_LIST(mb_list);
645 MB_LOCK_CONT(cnt_lst);
646 }
647
648 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) != NULL) {
649 /*
650 * This is the easy allocation case. We just grab an object
651 * from a bucket in the PCPU container. At worst, we
652 * have just emptied the bucket and so we remove it
653 * from the container.
654 */
655 MB_GET_OBJECT(m, bucket, cnt_lst);
656 MB_MBTYPES_INC(cnt_lst, type, 1);
657
658 /* If asked to persist, do not drop the lock. */
659 if ((persist & MBP_PERSIST) == 0)
660 MB_UNLOCK_CONT(cnt_lst);
661 else
662 *pers_list = cnt_lst->mb_cont.mc_numowner;
663 } else {
664 struct mb_gen_list *gen_list;
665
666 /*
667 * This is the less-common, more difficult case. We must
668 * first verify if the general list has anything for us
669 * and if that also fails, we must allocate a page from
670 * the map and create a new bucket to place in our PCPU
671 * container (already locked). If the map is starved then
672 * we're really in for trouble, as we have to wait on
673 * the general container's condition variable.
674 */
675 gen_list = MB_GET_GEN_LIST(mb_list);
676 MB_LOCK_CONT(gen_list);
677
678 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead)))
679 != NULL) {
680 /*
681 * Give ownership of the bucket to our CPU's
682 * container, but only actually put the bucket
683 * in the container if it doesn't become free
684 * upon removing an mbuf from it.
685 */
686 SLIST_REMOVE_HEAD(&(gen_list->mb_cont.mc_bhead),
687 mb_blist);
688 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
689 (*(gen_list->mb_cont.mc_numbucks))--;
690 (*(cnt_lst->mb_cont.mc_numbucks))++;
691 *(gen_list->mb_cont.mc_objcount) -= bucket->mb_numfree;
692 bucket->mb_numfree--;
693 m = bucket->mb_free[(bucket->mb_numfree)];
694 if (bucket->mb_numfree == 0) {
695 SLIST_NEXT(bucket, mb_blist) = NULL;
696 bucket->mb_owner |= MB_BUCKET_FREE;
697 } else {
698 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
699 bucket, mb_blist);
700 *(cnt_lst->mb_cont.mc_objcount) +=
701 bucket->mb_numfree;
702 }
703 MB_UNLOCK_CONT(gen_list);
704 MB_MBTYPES_INC(cnt_lst, type, 1);
705
706 /* If asked to persist, do not drop the lock. */
707 if ((persist & MBP_PERSIST) == 0)
708 MB_UNLOCK_CONT(cnt_lst);
709 else
710 *pers_list = cnt_lst->mb_cont.mc_numowner;
711 } else {
712 /*
713 * We'll have to allocate a new page.
714 */
715 MB_UNLOCK_CONT(gen_list);
716 bucket = mb_pop_cont(mb_list, how, cnt_lst);
717 if (bucket != NULL) {
718 MB_GET_OBJECT(m, bucket, cnt_lst);
719 MB_MBTYPES_INC(cnt_lst, type, 1);
720
721 /* If asked to persist, do not drop the lock. */
722 if ((persist & MBP_PERSIST) == 0)
723 MB_UNLOCK_CONT(cnt_lst);
724 else
725 *pers_list=cnt_lst->mb_cont.mc_numowner;
726 } else {
727 if (how == M_TRYWAIT) {
728 /*
729 * Absolute worst-case scenario.
730 * We block if we're willing to, but
731 * only after trying to steal from
732 * other lists.
733 */
734 m = mb_alloc_wait(mb_list, type);
735 } else {
736 /* XXX: No consistency. */
737 mbstat.m_drops++;
738
739 if (ticks < last_report ||
740 (ticks - last_report) >= hz) {
741 last_report = ticks;
742 printf(
743 "All mbufs or mbuf clusters exhausted, please see tuning(7).\n");
744 }
745
746 }
747 if (m != NULL && (persist & MBP_PERSIST) != 0) {
748 cnt_lst = MB_GET_PCPU_LIST(mb_list);
749 MB_LOCK_CONT(cnt_lst);
750 *pers_list=cnt_lst->mb_cont.mc_numowner;
751 }
752 }
753 }
754 }
755
756 return (m);
757 }
758
759 /*
760 * This is the worst-case scenario called only if we're allocating with
761 * M_TRYWAIT. We first drain all the protocols, then try to find an mbuf
762 * by looking in every PCPU container. If we're still unsuccessful, we
763 * try the general container one last time and possibly block on our
764 * starved cv.
765 */
766 static void *
767 mb_alloc_wait(struct mb_lstmngr *mb_list, short type)
768 {
769 struct mb_pcpu_list *cnt_lst;
770 struct mb_gen_list *gen_list;
771 struct mb_bucket *bucket;
772 void *m;
773 int i, cv_ret;
774
775 /*
776 * Try to reclaim mbuf-related objects (mbufs, clusters).
777 */
778 mb_reclaim();
779
780 /*
781 * Cycle all the PCPU containers. Increment starved counts if found
782 * empty.
783 */
784 for (i = 0; i < NCPU; i++) {
785 if (CPU_ABSENT(i))
786 continue;
787 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, i);
788 MB_LOCK_CONT(cnt_lst);
789
790 /*
791 * If container is non-empty, get a single object from it.
792 * If empty, increment starved count.
793 */
794 if ((bucket = SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead))) !=
795 NULL) {
796 MB_GET_OBJECT(m, bucket, cnt_lst);
797 MB_MBTYPES_INC(cnt_lst, type, 1);
798 MB_UNLOCK_CONT(cnt_lst);
799 mbstat.m_wait++; /* XXX: No consistency. */
800 return (m);
801 } else
802 cnt_lst->mb_cont.mc_starved++;
803
804 MB_UNLOCK_CONT(cnt_lst);
805 }
806
807 /*
808 * We're still here, so that means it's time to get the general
809 * container lock, check it one more time (now that mb_reclaim()
810 * has been called) and if we still get nothing, block on the cv.
811 */
812 gen_list = MB_GET_GEN_LIST(mb_list);
813 MB_LOCK_CONT(gen_list);
814 if ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL) {
815 MB_GET_OBJECT(m, bucket, gen_list);
816 MB_MBTYPES_INC(gen_list, type, 1);
817 MB_UNLOCK_CONT(gen_list);
818 mbstat.m_wait++; /* XXX: No consistency. */
819 return (m);
820 }
821
822 gen_list->mb_cont.mc_starved++;
823 cv_ret = cv_timedwait(&(gen_list->mgl_mstarved),
824 gen_list->mb_cont.mc_lock, mbuf_wait);
825 gen_list->mb_cont.mc_starved--;
826
827 if ((cv_ret == 0) &&
828 ((bucket = SLIST_FIRST(&(gen_list->mb_cont.mc_bhead))) != NULL)) {
829 MB_GET_OBJECT(m, bucket, gen_list);
830 MB_MBTYPES_INC(gen_list, type, 1);
831 mbstat.m_wait++; /* XXX: No consistency. */
832 } else {
833 mbstat.m_drops++; /* XXX: No consistency. */
834 m = NULL;
835 }
836
837 MB_UNLOCK_CONT(gen_list);
838
839 return (m);
840 }
841
842 /*-
843 * Free an object to its rightful container.
844 * In the very general case, this operation is really very easy.
845 * Complications arise primarily if:
846 * (a) We've hit the high limit on number of free objects allowed in
847 * our PCPU container.
848 * (b) We're in a critical situation where our container has been
849 * marked 'starved' and we need to issue wakeups on the starved
850 * condition variable.
851 * (c) Minor (odd) cases: our bucket has migrated while we were
852 * waiting for the lock; our bucket is in the general container;
853 * our bucket is empty.
854 */
855 static
856 void
857 mb_free(struct mb_lstmngr *mb_list, void *m, short type, short persist,
858 int *pers_list)
859 {
860 struct mb_pcpu_list *cnt_lst;
861 struct mb_gen_list *gen_list;
862 struct mb_bucket *bucket;
863 u_int owner;
864
865 bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
866
867 /*
868 * Make sure that if after we lock the bucket's present container the
869 * bucket has migrated, that we drop the lock and get the new one.
870 */
871 retry_lock:
872 owner = bucket->mb_owner & ~MB_BUCKET_FREE;
873 switch (owner) {
874 case MB_GENLIST_OWNER:
875 gen_list = MB_GET_GEN_LIST(mb_list);
876 if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
877 if (*pers_list != MB_GENLIST_OWNER) {
878 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
879 *pers_list);
880 MB_UNLOCK_CONT(cnt_lst);
881 MB_LOCK_CONT(gen_list);
882 }
883 } else {
884 MB_LOCK_CONT(gen_list);
885 }
886 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
887 MB_UNLOCK_CONT(gen_list);
888 *pers_list = -1;
889 goto retry_lock;
890 }
891
892 /*
893 * If we're intended for the general container, this is
894 * real easy: no migrating required. The only `bogon'
895 * is that we're now contending with all the threads
896 * dealing with the general list, but this is expected.
897 */
898 MB_PUT_OBJECT(m, bucket, gen_list);
899 MB_MBTYPES_DEC(gen_list, type, 1);
900 if (bucket->mb_owner & MB_BUCKET_FREE) {
901 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
902 bucket, mb_blist);
903 bucket->mb_owner = MB_GENLIST_OWNER;
904 }
905 if (gen_list->mb_cont.mc_starved > 0)
906 cv_signal(&(gen_list->mgl_mstarved));
907 if ((persist & MBP_PERSIST) == 0)
908 MB_UNLOCK_CONT(gen_list);
909 else
910 *pers_list = MB_GENLIST_OWNER;
911 break;
912
913 default:
914 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list, owner);
915 if (((persist & MBP_PERSISTENT) != 0) && (*pers_list >= 0)) {
916 if (*pers_list == MB_GENLIST_OWNER) {
917 gen_list = MB_GET_GEN_LIST(mb_list);
918 MB_UNLOCK_CONT(gen_list);
919 MB_LOCK_CONT(cnt_lst);
920 } else {
921 cnt_lst = MB_GET_PCPU_LIST_NUM(mb_list,
922 *pers_list);
923 owner = *pers_list;
924 }
925 } else {
926 MB_LOCK_CONT(cnt_lst);
927 }
928 if (owner != (bucket->mb_owner & ~MB_BUCKET_FREE)) {
929 MB_UNLOCK_CONT(cnt_lst);
930 *pers_list = -1;
931 goto retry_lock;
932 }
933
934 MB_PUT_OBJECT(m, bucket, cnt_lst);
935 MB_MBTYPES_DEC(cnt_lst, type, 1);
936 if ((*(cnt_lst->mb_cont.mc_objcount) > *(mb_list->ml_wmhigh)) ||
937 (cnt_lst->mb_cont.mc_starved > 0)) {
938 /*
939 * We've hit the high limit of allowed numbers of mbufs
940 * on this PCPU list or we've been flagged that we need
941 * to transfer a bucket over to the general cache.
942 * We must now migrate a bucket over to the general
943 * container.
944 */
945 gen_list = MB_GET_GEN_LIST(mb_list);
946 MB_LOCK_CONT(gen_list);
947 if ((bucket->mb_owner & MB_BUCKET_FREE) == 0) {
948 bucket =
949 SLIST_FIRST(&(cnt_lst->mb_cont.mc_bhead));
950 SLIST_REMOVE_HEAD(&(cnt_lst->mb_cont.mc_bhead),
951 mb_blist);
952 }
953 SLIST_INSERT_HEAD(&(gen_list->mb_cont.mc_bhead),
954 bucket, mb_blist);
955 bucket->mb_owner = MB_GENLIST_OWNER;
956 *(cnt_lst->mb_cont.mc_objcount) -= bucket->mb_numfree;
957 *(gen_list->mb_cont.mc_objcount) += bucket->mb_numfree;
958 (*(cnt_lst->mb_cont.mc_numbucks))--;
959 (*(gen_list->mb_cont.mc_numbucks))++;
960
961 /*
962 * While we're at it, transfer some of the mbtypes
963 * "count load" onto the general list's mbtypes
964 * array, seeing as how we're moving the bucket
965 * there now, meaning that the freeing of objects
966 * there will now decrement the _general list's_
967 * mbtypes counters, and no longer our PCPU list's
968 * mbtypes counters. We do this for the type presently
969 * being freed in an effort to keep the mbtypes
970 * counters approximately balanced across all lists.
971 */
972 MB_MBTYPES_DEC(cnt_lst, type,
973 mb_list->ml_objbucks - bucket->mb_numfree);
974 MB_MBTYPES_INC(gen_list, type,
975 mb_list->ml_objbucks - bucket->mb_numfree);
976
977 if (cnt_lst->mb_cont.mc_starved > 0) {
978 /*
979 * Determine whether or not to keep
980 * transferring buckets to the general list
981 * or whether we've transferred enough already.
982 * The thread that is blocked may end up waking
983 * up in the meantime, but transferring an
984 * extra bucket in a constrained situation
985 * is not so bad, as we're likely to need
986 * it soon anyway.
987 */
988 if (gen_list->mb_cont.mc_starved > 0) {
989 cnt_lst->mb_cont.mc_starved--;
990 cv_signal(&(gen_list->mgl_mstarved));
991 } else
992 cnt_lst->mb_cont.mc_starved = 0;
993 }
994 MB_UNLOCK_CONT(gen_list);
995 if ((persist & MBP_PERSIST) == 0)
996 MB_UNLOCK_CONT(cnt_lst);
997 else
998 *pers_list = owner;
999 break;
1000 }
1001
1002 if (bucket->mb_owner & MB_BUCKET_FREE) {
1003 SLIST_INSERT_HEAD(&(cnt_lst->mb_cont.mc_bhead),
1004 bucket, mb_blist);
1005 bucket->mb_owner = cnt_lst->mb_cont.mc_numowner;
1006 }
1007
1008 if ((persist & MBP_PERSIST) == 0)
1009 MB_UNLOCK_CONT(cnt_lst);
1010 else
1011 *pers_list = owner;
1012 break;
1013 }
1014 }
1015
1016 /*
1017 * Drain protocols in hopes to free up some resources.
1018 *
1019 * LOCKING NOTES:
1020 * No locks should be held when this is called. The drain routines have to
1021 * presently acquire some locks which raises the possibility of lock order
1022 * violation if we're holding any mutex if that mutex is acquired in reverse
1023 * order relative to one of the locks in the drain routines.
1024 */
1025 static void
1026 mb_reclaim(void)
1027 {
1028 struct domain *dp;
1029 struct protosw *pr;
1030
1031 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
1032 "mb_reclaim()");
1033
1034 mbstat.m_drain++; /* XXX: No consistency. */
1035
1036 for (dp = domains; dp != NULL; dp = dp->dom_next)
1037 for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
1038 if (pr->pr_drain != NULL)
1039 (*pr->pr_drain)();
1040 }
1041
1042 /******************************************************************************
1043 * Internal setup macros.
1044 */
1045
1046 #define _mb_setup(m, type) do { \
1047 (m)->m_type = (type); \
1048 (m)->m_next = NULL; \
1049 (m)->m_nextpkt = NULL; \
1050 (m)->m_data = (m)->m_dat; \
1051 (m)->m_flags = 0; \
1052 } while (0)
1053
1054 #define _mbhdr_setup(m, type) do { \
1055 (m)->m_type = (type); \
1056 (m)->m_next = NULL; \
1057 (m)->m_nextpkt = NULL; \
1058 (m)->m_data = (m)->m_pktdat; \
1059 (m)->m_flags = M_PKTHDR; \
1060 (m)->m_pkthdr.rcvif = NULL; \
1061 (m)->m_pkthdr.csum_flags = 0; \
1062 SLIST_INIT(&(m)->m_pkthdr.tags); \
1063 } while (0)
1064
1065 #define _mcl_setup(m) do { \
1066 (m)->m_data = (m)->m_ext.ext_buf; \
1067 (m)->m_flags |= M_EXT; \
1068 (m)->m_ext.ext_free = NULL; \
1069 (m)->m_ext.ext_args = NULL; \
1070 (m)->m_ext.ext_size = MCLBYTES; \
1071 (m)->m_ext.ext_type = EXT_CLUSTER; \
1072 } while (0)
1073
1074 #define _mext_init_ref(m, ref) do { \
1075 (m)->m_ext.ref_cnt = ((ref) == NULL) ? \
1076 malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)(ref); \
1077 if ((m)->m_ext.ref_cnt != NULL) { \
1078 *((m)->m_ext.ref_cnt) = 0; \
1079 MEXT_ADD_REF((m)); \
1080 } \
1081 } while (0)
1082
1083 #define cl2ref(cl) \
1084 (((uintptr_t)(cl) - (uintptr_t)mb_list_clust.ml_mapbase) >> MCLSHIFT)
1085
1086 #define _mext_dealloc_ref(m) \
1087 if ((m)->m_ext.ext_type != EXT_EXTREF) \
1088 free((m)->m_ext.ref_cnt, M_MBUF)
1089
1090 /******************************************************************************
1091 * Internal routines.
1092 *
1093 * Because mb_alloc() and mb_free() are inlines (to keep the common
1094 * cases down to a maximum of one function call), below are a few
1095 * routines used only internally for the sole purpose of making certain
1096 * functions smaller.
1097 *
1098 * - _mext_free(): frees associated storage when the ref. count is
1099 * exactly one and we're freeing.
1100 *
1101 * - _mgetm_internal(): common "persistent-lock" routine that allocates
1102 * an mbuf and a cluster in one shot, but where the lock is already
1103 * held coming in (which is what makes it different from the exported
1104 * m_getcl()). The lock is dropped when done. This is used by m_getm()
1105 * and, therefore, is very m_getm()-specific.
1106 */
1107 static struct mbuf *_mgetm_internal(int, short, short, int);
1108
1109 void
1110 _mext_free(struct mbuf *mb)
1111 {
1112
1113 if (mb->m_ext.ext_type == EXT_CLUSTER) {
1114 mb_free(&mb_list_clust, (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
1115 0, NULL);
1116 } else {
1117 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf, mb->m_ext.ext_args);
1118 _mext_dealloc_ref(mb);
1119 }
1120 }
1121
1122 static struct mbuf *
1123 _mgetm_internal(int how, short type, short persist, int cchnum)
1124 {
1125 struct mbuf *mb;
1126
1127 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, persist,&cchnum);
1128 if (mb == NULL)
1129 return NULL;
1130 _mb_setup(mb, type);
1131
1132 if ((persist & MBP_PERSIST) != 0) {
1133 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
1134 how, MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
1135 if (mb->m_ext.ext_buf == NULL) {
1136 (void)m_free(mb);
1137 			return (NULL);
1138 }
1139 _mcl_setup(mb);
1140 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1141 }
1142 return (mb);
1143 }
1144
1145 /******************************************************************************
1146 * Exported buffer allocation and de-allocation routines.
1147 */
1148
1149 /*
1150 * Allocate and return a single (normal) mbuf. NULL is returned on failure.
1151 *
1152 * Arguments:
1153 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1154 * if really starved for memory. M_DONTWAIT to never block.
1155 * - type: the type of the mbuf being allocated.
1156 */
1157 struct mbuf *
1158 m_get(int how, short type)
1159 {
1160 struct mbuf *mb;
1161
1162 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1163 if (mb != NULL)
1164 _mb_setup(mb, type);
1165 return (mb);
1166 }
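/*
 * Example (minimal sketch, kept out of the build with #if 0): typical use of
 * m_get() and m_free() by a consumer; the function name is hypothetical.
 */
#if 0
static void
mb_example_get(void)
{
	struct mbuf *m;

	m = m_get(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return;
	m->m_len = 0;
	/* ... store at most MLEN bytes at mtod(m, caddr_t) ... */
	(void)m_free(m);
}
#endif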
1167
1168 /*
1169 * Allocate a given length worth of mbufs and/or clusters (whatever fits
1170 * best) and return a pointer to the top of the allocated chain. If an
1171 * existing mbuf chain is provided, then we will append the new chain
1172 * to the existing one but still return the top of the newly allocated
1173 * chain. NULL is returned on failure, in which case the [optional]
1174 * provided chain is left untouched, and any memory already allocated
1175 * is freed.
1176 *
1177 * Arguments:
1178 * - m: existing chain to which to append new chain (optional).
1179 * - len: total length of data to append, either in mbufs or clusters
1180 * (we allocate whatever combination yields the best fit).
1181 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1182 * if really starved for memory. M_DONTWAIT to never block.
1183 * - type: the type of the mbuf being allocated.
1184 */
1185 struct mbuf *
1186 m_getm(struct mbuf *m, int len, int how, short type)
1187 {
1188 struct mbuf *mb, *top, *cur, *mtail;
1189 int num, rem, cchnum;
1190 short persist;
1191 int i;
1192
1193 KASSERT(len >= 0, ("m_getm(): len is < 0"));
1194
1195 /* If m != NULL, we will append to the end of that chain. */
1196 if (m != NULL)
1197 for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
1198 else
1199 mtail = NULL;
1200
1201 /*
1202 * In the best-case scenario (which should be the common case
1203 * unless we're in a starvation situation), we will be able to
1204 * go through the allocation of all the desired mbufs and clusters
1205 * here without dropping our per-CPU cache lock in between.
1206 */
1207 num = len / MCLBYTES;
1208 rem = len % MCLBYTES;
1209 persist = 0;
1210 cchnum = -1;
1211 top = cur = NULL;
1212 for (i = 0; i < num; i++) {
1213 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
1214 MBP_PERSIST | persist, &cchnum);
1215 if (mb == NULL)
1216 goto failed;
1217 _mb_setup(mb, type);
1218 mb->m_len = 0;
1219
1220 persist = (i != (num - 1) || rem > 0) ? MBP_PERSIST : 0;
1221 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust,
1222 how, MT_NOTMBUF, persist | MBP_PERSISTENT, &cchnum);
1223 if (mb->m_ext.ext_buf == NULL) {
1224 (void)m_free(mb);
1225 goto failed;
1226 }
1227 _mcl_setup(mb);
1228 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1229 persist = MBP_PERSISTENT;
1230
1231 if (cur == NULL)
1232 top = cur = mb;
1233 else
1234 cur = (cur->m_next = mb);
1235 }
1236 if (rem > 0) {
1237 if (cchnum >= 0) {
1238 persist = MBP_PERSISTENT;
1239 persist |= (rem > MINCLSIZE) ? MBP_PERSIST : 0;
1240 mb = _mgetm_internal(how, type, persist, cchnum);
1241 if (mb == NULL)
1242 goto failed;
1243 } else if (rem > MINCLSIZE) {
1244 mb = m_getcl(how, type, 0);
1245 } else {
1246 mb = m_get(how, type);
1247 }
1248 if (mb != NULL) {
1249 mb->m_len = 0;
1250 if (cur == NULL)
1251 top = mb;
1252 else
1253 cur->m_next = mb;
1254 } else
1255 goto failed;
1256 }
1257
1258 if (mtail != NULL)
1259 mtail->m_next = top;
1260 return top;
1261 failed:
1262 if (top != NULL)
1263 m_freem(top);
1264 return NULL;
1265 }
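/*
 * Example (minimal sketch, kept out of the build with #if 0): asking m_getm()
 * for enough mbufs and clusters to hold "len" bytes and then releasing the
 * whole chain; the function name is hypothetical.
 */
#if 0
static void
mb_example_getm(int len)
{
	struct mbuf *chain;

	chain = m_getm(NULL, len, M_DONTWAIT, MT_DATA);
	if (chain == NULL)
		return;
	/* ... fill the chain, e.g. with m_copyback() ... */
	m_freem(chain);
}
#endif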
1266
1267 /*
1268 * Allocate and return a single M_PKTHDR mbuf. NULL is returned on failure.
1269 *
1270 * Arguments:
1271 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1272 * if really starved for memory. M_DONTWAIT to never block.
1273 * - type: the type of the mbuf being allocated.
1274 */
1275 struct mbuf *
1276 m_gethdr(int how, short type)
1277 {
1278 struct mbuf *mb;
1279
1280 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1281 if (mb != NULL) {
1282 _mbhdr_setup(mb, type);
1283 #ifdef MAC
1284 if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
1285 m_free(mb);
1286 return (NULL);
1287 }
1288 #endif
1289 }
1290 return (mb);
1291 }
1292
1293 /*
1294 * Allocate and return a single (normal) pre-zero'd mbuf. NULL is
1295 * returned on failure.
1296 *
1297 * Arguments:
1298 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1299 * if really starved for memory. M_DONTWAIT to never block.
1300 * - type: the type of the mbuf being allocated.
1301 */
1302 struct mbuf *
1303 m_get_clrd(int how, short type)
1304 {
1305 struct mbuf *mb;
1306
1307 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1308 if (mb != NULL) {
1309 _mb_setup(mb, type);
1310 bzero(mtod(mb, caddr_t), MLEN);
1311 }
1312 return (mb);
1313 }
1314
1315 /*
1316 * Allocate and return a single M_PKTHDR pre-zero'd mbuf. NULL is
1317 * returned on failure.
1318 *
1319 * Arguments:
1320 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1321 * if really starved for memory. M_DONTWAIT to never block.
1322 * - type: the type of the mbuf being allocated.
1323 */
1324 struct mbuf *
1325 m_gethdr_clrd(int how, short type)
1326 {
1327 struct mbuf *mb;
1328
1329 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type, 0, NULL);
1330 if (mb != NULL) {
1331 _mbhdr_setup(mb, type);
1332 #ifdef MAC
1333 if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
1334 m_free(mb);
1335 return (NULL);
1336 }
1337 #endif
1338 bzero(mtod(mb, caddr_t), MHLEN);
1339 }
1340 return (mb);
1341 }
1342
1343 /*
1344 * Free a single mbuf and any associated storage that it may have attached
1345 * to it. The associated storage may not be immediately freed if its
1346 * reference count is above 1. Returns the next mbuf in the chain following
1347 * the mbuf being freed.
1348 *
1349 * Arguments:
1350 * - mb: the mbuf to free.
1351 */
1352 struct mbuf *
1353 m_free(struct mbuf *mb)
1354 {
1355 struct mbuf *nb;
1356 int cchnum;
1357 short persist = 0;
1358
1359 #ifdef INVARIANTS
1360 if (mb->m_flags & M_FREELIST)
1361 panic("m_free detected a mbuf double-free");
1362 mb->m_flags |= M_FREELIST;
1363 #endif
1364 if ((mb->m_flags & M_PKTHDR) != 0)
1365 m_tag_delete_chain(mb, NULL);
1366 nb = mb->m_next;
1367 if ((mb->m_flags & M_EXT) != 0) {
1368 MEXT_REM_REF(mb);
1369 if (atomic_cmpset_int(mb->m_ext.ref_cnt, 0, 1)) {
1370 if (mb->m_ext.ext_type == EXT_CLUSTER) {
1371 mb_free(&mb_list_clust,
1372 (caddr_t)mb->m_ext.ext_buf, MT_NOTMBUF,
1373 MBP_PERSIST, &cchnum);
1374 persist = MBP_PERSISTENT;
1375 } else {
1376 (*(mb->m_ext.ext_free))(mb->m_ext.ext_buf,
1377 mb->m_ext.ext_args);
1378 _mext_dealloc_ref(mb);
1379 persist = 0;
1380 }
1381 }
1382 }
1383 mb_free(&mb_list_mbuf, mb, mb->m_type, persist, &cchnum);
1384 return (nb);
1385 }
1386
1387 /*
1388 * Free an entire chain of mbufs and associated external buffers, if
1389 * applicable. Right now, we only optimize a little so that the cache
1390 * lock may be held across a single mbuf+cluster free. Hopefully,
1391 * we'll eventually be holding the lock across more than merely two
1392 * consecutive frees but right now this is hard to implement because of
1393 * things like _mext_dealloc_ref (may do a free()) and atomic ops in the
1394 * loop.
1395 *
1396 * - mb: the mbuf chain to free.
1397 */
1398 void
1399 m_freem(struct mbuf *mb)
1400 {
1401
1402 while (mb != NULL)
1403 mb = m_free(mb);
1404 }
1405
1406 /*
1407 * Fetch an mbuf with a cluster attached to it. If one of the
1408 * allocations fails, the entire allocation fails. This routine is
1409 * the preferred way of fetching both the mbuf and cluster together,
1410 * as it avoids having to unlock/relock between allocations. Returns
1411 * NULL on failure.
1412 *
1413 * Arguments:
1414 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1415 * if really starved for memory. M_DONTWAIT to never block.
1416 * - type: the type of the mbuf being allocated.
1417 * - flags: any flags to pass to the mbuf being allocated; if this includes
1418 * the M_PKTHDR bit, then the mbuf is configured as a M_PKTHDR mbuf.
1419 */
1420 struct mbuf *
1421 m_getcl(int how, short type, int flags)
1422 {
1423 struct mbuf *mb;
1424 int cchnum;
1425
1426 mb = (struct mbuf *)mb_alloc(&mb_list_mbuf, how, type,
1427 MBP_PERSIST, &cchnum);
1428 if (mb == NULL)
1429 return NULL;
1430 mb->m_type = type;
1431 mb->m_next = NULL;
1432 mb->m_flags = flags;
1433 if ((flags & M_PKTHDR) != 0) {
1434 mb->m_nextpkt = NULL;
1435 mb->m_pkthdr.rcvif = NULL;
1436 mb->m_pkthdr.csum_flags = 0;
1437 SLIST_INIT(&mb->m_pkthdr.tags);
1438 }
1439
1440 mb->m_ext.ext_buf = (caddr_t)mb_alloc(&mb_list_clust, how,
1441 MT_NOTMBUF, MBP_PERSISTENT, &cchnum);
1442 if (mb->m_ext.ext_buf == NULL) {
1443 (void)m_free(mb);
1444 mb = NULL;
1445 } else {
1446 _mcl_setup(mb);
1447 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1448 }
1449 #ifdef MAC
1450 	if (mb != NULL && (flags & M_PKTHDR)) {
1451 if (mac_init_mbuf(mb, MBTOM(how)) != 0) {
1452 m_free(mb);
1453 return (NULL);
1454 }
1455 }
1456 #endif
1457 return (mb);
1458 }
1459
1460 /*
1461 * Fetch a single mbuf cluster and attach it to an existing mbuf. If
1462 * successful, configures the provided mbuf to have mbuf->m_ext.ext_buf
1463 * pointing to the cluster, and sets the M_EXT bit in the mbuf's flags.
1464 * The M_EXT bit is not set on failure.
1465 *
1466 * Arguments:
1467 * - mb: the existing mbuf to which to attach the allocated cluster.
1468 * - how: M_TRYWAIT to try to block for kern.ipc.mbuf_wait number of ticks
1469 * if really starved for memory. M_DONTWAIT to never block.
1470 */
1471 void
1472 m_clget(struct mbuf *mb, int how)
1473 {
1474
1475 mb->m_ext.ext_buf= (caddr_t)mb_alloc(&mb_list_clust,how,MT_NOTMBUF,
1476 0, NULL);
1477 if (mb->m_ext.ext_buf != NULL) {
1478 _mcl_setup(mb);
1479 _mext_init_ref(mb, &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)]);
1480 }
1481 }
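/*
 * Example (minimal sketch, kept out of the build with #if 0): attaching a
 * cluster to an existing mbuf and detecting failure through the M_EXT flag,
 * as described above; the function name is hypothetical.
 */
#if 0
static void
mb_example_clget(struct mbuf *m)
{

	m_clget(m, M_DONTWAIT);
	if ((m->m_flags & M_EXT) == 0)
		return;		/* No cluster could be attached. */
	m->m_len = 0;
	/* ... up to MCLBYTES bytes may now be stored at mtod(m, caddr_t) ... */
}
#endif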
1482
1483 /*
1484 * Configure a provided mbuf to refer to the provided external storage
1485 * buffer and setup a reference count for said buffer. If the setting
1486 * up of the reference count fails, the M_EXT bit will not be set. If
1487 * successful, the M_EXT bit is set in the mbuf's flags.
1488 *
1489 * Arguments:
1490 * - mb: the existing mbuf to which to attach the provided buffer.
1491 * - buf: the address of the provided external storage buffer.
1492 * - size: the size of the provided buffer.
1493 * - freef: a pointer to a routine that is responsible for freeing the
1494 * provided external storage buffer.
1495 * - args: a pointer to an argument structure (of any type) to be passed
1496 * to the provided freef routine (may be NULL).
1497 * - flags: any other flags to be passed to the provided mbuf.
1498 * - type: the type that the external storage buffer should be labeled with.
1499 */
1500 void
1501 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
1502 void (*freef)(void *, void *), void *args, int flags, int type)
1503 {
1504 u_int *ref_cnt = NULL;
1505
1506 if (type == EXT_CLUSTER)
1507 ref_cnt = &cl_refcntmap[cl2ref(mb->m_ext.ext_buf)];
1508 else if (type == EXT_EXTREF)
1509 ref_cnt = mb->m_ext.ref_cnt;
1510 _mext_init_ref(mb, ref_cnt);
1511 if (mb->m_ext.ref_cnt != NULL) {
1512 mb->m_flags |= (M_EXT | flags);
1513 mb->m_ext.ext_buf = buf;
1514 mb->m_data = mb->m_ext.ext_buf;
1515 mb->m_ext.ext_size = size;
1516 mb->m_ext.ext_free = freef;
1517 mb->m_ext.ext_args = args;
1518 mb->m_ext.ext_type = type;
1519 }
1520 }
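/*
 * Example (minimal sketch, kept out of the build with #if 0): attaching
 * caller-supplied external storage with m_extadd(). The buffer, its free
 * routine and both function names are hypothetical; EXT_NET_DRV and M_DEVBUF
 * are the generic driver-buffer type and malloc type from sys/mbuf.h and
 * sys/malloc.h, assumed here purely for illustration.
 */
#if 0
static void
example_ext_free(void *buf, void *args)
{

	free(buf, M_DEVBUF);
}

static void
mb_example_extadd(struct mbuf *m, caddr_t buf, u_int size)
{

	m_extadd(m, buf, size, example_ext_free, NULL, 0, EXT_NET_DRV);
	if ((m->m_flags & M_EXT) == 0)
		return;		/* Reference count setup failed. */
	m->m_len = 0;
}
#endif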
1521
1522 /*
1523 * Change type of provided mbuf. This is a relatively expensive operation
1524 * (due to the cost of statistics manipulations) and should be avoided, where
1525 * possible.
1526 *
1527 * Arguments:
1528 * - mb: the provided mbuf for which the type needs to be changed.
1529 * - new_type: the new type to change the mbuf to.
1530 */
1531 void
1532 m_chtype(struct mbuf *mb, short new_type)
1533 {
1534 struct mb_gen_list *gen_list;
1535
1536 gen_list = MB_GET_GEN_LIST(&mb_list_mbuf);
1537 MB_LOCK_CONT(gen_list);
1538 MB_MBTYPES_DEC(gen_list, mb->m_type, 1);
1539 MB_MBTYPES_INC(gen_list, new_type, 1);
1540 MB_UNLOCK_CONT(gen_list);
1541 mb->m_type = new_type;
1542 }