FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_blist.c
1 /*-
2 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
3 * Redistribution and use in source and binary forms, with or without
4 * modification, are permitted provided that the following conditions
5 * are met:
6 * 1. Redistributions of source code must retain the above copyright
7 * notice, this list of conditions and the following disclaimer.
8 * 2. Redistributions in binary form must reproduce the above copyright
9 * notice, this list of conditions and the following disclaimer in the
10 * documentation and/or other materials provided with the distribution.
11 * 4. Neither the name of the University nor the names of its contributors
12 * may be used to endorse or promote products derived from this software
13 * without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
19 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
21 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 /*
28 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
29 *
30 * This module implements a general bitmap allocator/deallocator. The
31 * allocator eats around 2 bits per 'block'. The module does not
32 * try to interpret the meaning of a 'block' other than to return
33 * SWAPBLK_NONE on an allocation failure.
34 *
35 * A radix tree controls access to pieces of the bitmap, and includes
36 * auxiliary information at each interior node about the availabilty of
37 * contiguous free blocks in the subtree rooted at that node. Two radix
38 * constants are involved: one for the size of the bitmaps contained in the
39 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
40 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is
41 * associated with a range of blocks. The root of any subtree stores a
42 * hint field that defines an upper bound on the size of the largest
43 * allocation that can begin in the associated block range. A hint is an
44 * upper bound on a potential allocation, but not necessarily a tight upper
45 * bound.
46 *
47 * The radix tree also implements two collapsed states for meta nodes:
48 * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
49 * in either of these two states, all information contained underneath
50 * the node is considered stale. These states are used to optimize
51 * allocation and freeing operations.
52 *
53 * The hinting greatly increases code efficiency for allocations while
54 * the general radix structure optimizes both allocations and frees. The
55 * radix tree should be able to operate well no matter how much
56 * fragmentation there is and no matter how large a bitmap is used.
57 *
58 * The blist code wires all necessary memory at creation time. Neither
59 * allocations nor frees require interaction with the memory subsystem.
60 * The non-blocking features of the blist code are used in the swap code
61 * (vm/swap_pager.c).
62 *
63 * LAYOUT: The radix tree is laid out recursively using a
64 * linear array. Each meta node is immediately followed (laid out
65 * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
66 * is a recursive structure but one that can be easily scanned through
67 * a very simple 'skip' calculation. In order to support large radixes,
68 * portions of the tree may reside outside our memory allocation. We
69 * handle this with an early-termination optimization (when bighint is
70 * set to -1) on the scan. The memory allocation is only large enough
71 * to cover the number of blocks requested at creation time even if it
72 * must be encompassed in larger root-node radix.
73 *
74 * NOTE: the allocator cannot currently allocate more than
75 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
76 * large' if you try. This is an area that could use improvement. The
77 * radix is large enough that this restriction does not effect the swap
78 * system, though. Currently only the allocation code is affected by
79 * this algorithmic unfeature. The freeing code can handle arbitrary
80 * ranges.
81 *
82 * This code can be compiled stand-alone for debugging.
83 */
84
85 #include <sys/cdefs.h>
86 __FBSDID("$FreeBSD: releng/11.2/sys/kern/subr_blist.c 331722 2018-03-29 02:50:57Z eadler $");
87
88 #ifdef _KERNEL
89
90 #include <sys/param.h>
91 #include <sys/systm.h>
92 #include <sys/lock.h>
93 #include <sys/kernel.h>
94 #include <sys/blist.h>
95 #include <sys/malloc.h>
96 #include <sys/sbuf.h>
97 #include <sys/proc.h>
98 #include <sys/mutex.h>
99
100 #else
101
102 #ifndef BLIST_NO_DEBUG
103 #define BLIST_DEBUG
104 #endif
105
106 #include <sys/types.h>
107 #include <sys/malloc.h>
108 #include <sys/sbuf.h>
109 #include <stdio.h>
110 #include <string.h>
111 #include <stdlib.h>
112 #include <stdarg.h>
113 #include <stdbool.h>
114
115 #define bitcount64(x) __bitcount64((uint64_t)(x))
116 #define malloc(a,b,c) calloc(a, 1)
117 #define free(a,b) free(a)
118 static __inline int imax(int a, int b) { return (a > b ? a : b); }
119
120 #include <sys/blist.h>
121
122 void panic(const char *ctl, ...);
123
124 #endif
125
126 /*
127 * static support functions
128 */
129 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
130 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
131 u_daddr_t radix);
132 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
133 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
134 u_daddr_t radix);
135 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
136 blist_t dest, daddr_t count);
137 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
138 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
139 u_daddr_t radix);
140 static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
141 #ifndef _KERNEL
142 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
143 int tab);
144 #endif
145
146 #ifdef _KERNEL
147 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
148 #endif
149
150 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
151 "radix divisibility error");
152 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1)
153 #define BLIST_META_MASK (BLIST_META_RADIX - 1)
154
155 /*
156 * For a subtree that can represent the state of up to 'radix' blocks, the
157 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
158 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
159 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
160 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
161 * in the 'meta' functions that process subtrees. Since integer division
162 * discards remainders, we can express this computation as
163 * skip = (m * m**h) / (m - 1)
164 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
165 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
166 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
167 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
168 * so that simple integer division by a constant can safely be used for the
169 * calculation.
170 */
171 static inline daddr_t
172 radix_to_skip(daddr_t radix)
173 {
174
175 return (radix /
176 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
177 }
178
179 /*
180 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
181 * Assumes that the argument has only one bit set.
182 */
183 static inline int
184 bitpos(u_daddr_t mask)
185 {
186 int hi, lo, mid;
187
188 switch (sizeof(mask)) {
189 #ifdef HAVE_INLINE_FFSLL
190 case sizeof(long long):
191 return (ffsll(mask) - 1);
192 #endif
193 default:
194 lo = 0;
195 hi = BLIST_BMAP_RADIX;
196 while (lo + 1 < hi) {
197 mid = (lo + hi) >> 1;
198 if ((mask >> mid) != 0)
199 lo = mid;
200 else
201 hi = mid;
202 }
203 return (lo);
204 }
205 }
206
207 /*
208 * blist_create() - create a blist capable of handling up to the specified
209 * number of blocks
210 *
211 * blocks - must be greater than 0
212 * flags - malloc flags
213 *
214 * The smallest blist consists of a single leaf node capable of
215 * managing BLIST_BMAP_RADIX blocks.
216 */
217 blist_t
218 blist_create(daddr_t blocks, int flags)
219 {
220 blist_t bl;
221 daddr_t nodes, radix;
222
223 /*
224 * Calculate the radix field used for scanning.
225 */
226 radix = BLIST_BMAP_RADIX;
227 while (radix < blocks) {
228 radix *= BLIST_META_RADIX;
229 }
230 nodes = 1 + blst_radix_init(NULL, radix, blocks);
231
232 bl = malloc(sizeof(struct blist), M_SWAP, flags);
233 if (bl == NULL)
234 return (NULL);
235
236 bl->bl_blocks = blocks;
237 bl->bl_radix = radix;
238 bl->bl_cursor = 0;
239 bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
240 if (bl->bl_root == NULL) {
241 free(bl, M_SWAP);
242 return (NULL);
243 }
244 blst_radix_init(bl->bl_root, radix, blocks);
245
246 #if defined(BLIST_DEBUG)
247 printf(
248 "BLIST representing %lld blocks (%lld MB of swap)"
249 ", requiring %lldK of ram\n",
250 (long long)bl->bl_blocks,
251 (long long)bl->bl_blocks * 4 / 1024,
252 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
253 );
254 printf("BLIST raw radix tree contains %lld records\n",
255 (long long)nodes);
256 #endif
257
258 return (bl);
259 }
260
261 void
262 blist_destroy(blist_t bl)
263 {
264 free(bl->bl_root, M_SWAP);
265 free(bl, M_SWAP);
266 }
267
268 /*
269 * blist_alloc() - reserve space in the block bitmap. Return the base
270 * of a contiguous region or SWAPBLK_NONE if space could
271 * not be allocated.
272 */
273 daddr_t
274 blist_alloc(blist_t bl, daddr_t count)
275 {
276 daddr_t blk;
277
278 /*
279 * This loop iterates at most twice. An allocation failure in the
280 * first iteration leads to a second iteration only if the cursor was
281 * non-zero. When the cursor is zero, an allocation failure will
282 * reduce the hint, stopping further iterations.
283 */
284 while (count <= bl->bl_root->bm_bighint) {
285 blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
286 bl->bl_radix);
287 if (blk != SWAPBLK_NONE) {
288 bl->bl_cursor = blk + count;
289 if (bl->bl_cursor == bl->bl_blocks)
290 bl->bl_cursor = 0;
291 return (blk);
292 } else if (bl->bl_cursor != 0)
293 bl->bl_cursor = 0;
294 }
295 return (SWAPBLK_NONE);
296 }
297
298 /*
299 * blist_avail() - return the number of free blocks.
300 */
301 daddr_t
302 blist_avail(blist_t bl)
303 {
304
305 if (bl->bl_radix == BLIST_BMAP_RADIX)
306 return (bitcount64(bl->bl_root->u.bmu_bitmap));
307 else
308 return (bl->bl_root->u.bmu_avail);
309 }
310
311 /*
312 * blist_free() - free up space in the block bitmap. Return the base
313 * of a contiguous region. Panic if an inconsistancy is
314 * found.
315 */
316 void
317 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
318 {
319
320 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
321 }
322
323 /*
324 * blist_fill() - mark a region in the block bitmap as off-limits
325 * to the allocator (i.e. allocate it), ignoring any
326 * existing allocations. Return the number of blocks
327 * actually filled that were free before the call.
328 */
329 daddr_t
330 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
331 {
332
333 return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
334 }
335
336 /*
337 * blist_resize() - resize an existing radix tree to handle the
338 * specified number of blocks. This will reallocate
339 * the tree and transfer the previous bitmap to the new
340 * one. When extending the tree you can specify whether
341 * the new blocks are to left allocated or freed.
342 */
343 void
344 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
345 {
346 blist_t newbl = blist_create(count, flags);
347 blist_t save = *pbl;
348
349 *pbl = newbl;
350 if (count > save->bl_blocks)
351 count = save->bl_blocks;
352 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
353
354 /*
355 * If resizing upwards, should we free the new space or not?
356 */
357 if (freenew && count < newbl->bl_blocks) {
358 blist_free(newbl, count, newbl->bl_blocks - count);
359 }
360 blist_destroy(save);
361 }
362
363 #ifdef BLIST_DEBUG
364
365 /*
366 * blist_print() - dump radix tree
367 */
368 void
369 blist_print(blist_t bl)
370 {
371 printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
372 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
373 printf("}\n");
374 }
375
376 #endif
377
378 static const u_daddr_t fib[] = {
379 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
380 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
381 514229, 832040, 1346269, 2178309, 3524578,
382 };
383
384 /*
385 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
386 */
387 struct gap_stats {
388 daddr_t start; /* current gap start, or SWAPBLK_NONE */
389 daddr_t num; /* number of gaps observed */
390 daddr_t max; /* largest gap size */
391 daddr_t avg; /* average gap size */
392 daddr_t err; /* sum - num * avg */
393 daddr_t histo[nitems(fib)]; /* # gaps in each size range */
394 int max_bucket; /* last histo elt with nonzero val */
395 };
396
397 /*
398 * gap_stats_counting() - is the state 'counting 1 bits'?
399 * or 'skipping 0 bits'?
400 */
401 static inline bool
402 gap_stats_counting(const struct gap_stats *stats)
403 {
404
405 return (stats->start != SWAPBLK_NONE);
406 }
407
408 /*
409 * init_gap_stats() - initialize stats on gap sizes
410 */
411 static inline void
412 init_gap_stats(struct gap_stats *stats)
413 {
414
415 bzero(stats, sizeof(*stats));
416 stats->start = SWAPBLK_NONE;
417 }
418
419 /*
420 * update_gap_stats() - update stats on gap sizes
421 */
422 static void
423 update_gap_stats(struct gap_stats *stats, daddr_t posn)
424 {
425 daddr_t size;
426 int hi, lo, mid;
427
428 if (!gap_stats_counting(stats)) {
429 stats->start = posn;
430 return;
431 }
432 size = posn - stats->start;
433 stats->start = SWAPBLK_NONE;
434 if (size > stats->max)
435 stats->max = size;
436
437 /*
438 * Find the fibonacci range that contains size,
439 * expecting to find it in an early range.
440 */
441 lo = 0;
442 hi = 1;
443 while (hi < nitems(fib) && fib[hi] <= size) {
444 lo = hi;
445 hi *= 2;
446 }
447 if (hi >= nitems(fib))
448 hi = nitems(fib);
449 while (lo + 1 != hi) {
450 mid = (lo + hi) >> 1;
451 if (fib[mid] <= size)
452 lo = mid;
453 else
454 hi = mid;
455 }
456 stats->histo[lo]++;
457 if (lo > stats->max_bucket)
458 stats->max_bucket = lo;
459 stats->err += size - stats->avg;
460 stats->num++;
461 stats->avg += stats->err / stats->num;
462 stats->err %= stats->num;
463 }
464
465 /*
466 * dump_gap_stats() - print stats on gap sizes
467 */
468 static inline void
469 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
470 {
471 int i;
472
473 sbuf_printf(s, "number of maximal free ranges: %jd\n",
474 (intmax_t)stats->num);
475 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
476 sbuf_printf(s, "average maximal free range size: %jd\n",
477 (intmax_t)stats->avg);
478 sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
479 sbuf_printf(s, " count | size range\n");
480 sbuf_printf(s, " ----- | ----------\n");
481 for (i = 0; i < stats->max_bucket; i++) {
482 if (stats->histo[i] != 0) {
483 sbuf_printf(s, "%20jd | ",
484 (intmax_t)stats->histo[i]);
485 if (fib[i] != fib[i + 1] - 1)
486 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
487 (intmax_t)fib[i + 1] - 1);
488 else
489 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
490 }
491 }
492 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
493 if (stats->histo[i] > 1)
494 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
495 (intmax_t)stats->max);
496 else
497 sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
498 }
499
500 /*
501 * blist_stats() - dump radix tree stats
502 */
503 void
504 blist_stats(blist_t bl, struct sbuf *s)
505 {
506 struct gap_stats gstats;
507 struct gap_stats *stats = &gstats;
508 daddr_t i, nodes, radix;
509 u_daddr_t bit, diff, mask;
510
511 init_gap_stats(stats);
512 nodes = 0;
513 i = bl->bl_radix;
514 while (i < bl->bl_radix + bl->bl_blocks) {
515 /*
516 * Find max size subtree starting at i.
517 */
518 radix = BLIST_BMAP_RADIX;
519 while (((i / radix) & BLIST_META_MASK) == 0)
520 radix *= BLIST_META_RADIX;
521
522 /*
523 * Check for skippable subtrees starting at i.
524 */
525 while (radix > BLIST_BMAP_RADIX) {
526 if (bl->bl_root[nodes].u.bmu_avail == 0) {
527 if (gap_stats_counting(stats))
528 update_gap_stats(stats, i);
529 break;
530 }
531 if (bl->bl_root[nodes].u.bmu_avail == radix) {
532 if (!gap_stats_counting(stats))
533 update_gap_stats(stats, i);
534 break;
535 }
536
537 /*
538 * Skip subtree root.
539 */
540 nodes++;
541 radix /= BLIST_META_RADIX;
542 }
543 if (radix == BLIST_BMAP_RADIX) {
544 /*
545 * Scan leaf.
546 */
547 mask = bl->bl_root[nodes].u.bmu_bitmap;
548 diff = mask ^ (mask << 1);
549 if (gap_stats_counting(stats))
550 diff ^= 1;
551 while (diff != 0) {
552 bit = diff & -diff;
553 update_gap_stats(stats, i + bitpos(bit));
554 diff ^= bit;
555 }
556 }
557 nodes += radix_to_skip(radix);
558 i += radix;
559 }
560 update_gap_stats(stats, i);
561 dump_gap_stats(stats, s);
562 }
563
564 /************************************************************************
565 * ALLOCATION SUPPORT FUNCTIONS *
566 ************************************************************************
567 *
568 * These support functions do all the actual work. They may seem
569 * rather longish, but that's because I've commented them up. The
570 * actual code is straight forward.
571 *
572 */
573
574 /*
575 * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
576 *
577 * This is the core of the allocator and is optimized for the
578 * BLIST_BMAP_RADIX block allocation case. Otherwise, execution
579 * time is proportional to log2(count) + bitpos time.
580 */
581 static daddr_t
582 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
583 {
584 u_daddr_t mask;
585 int count1, hi, lo, num_shifts, range1, range_ext;
586
587 range1 = 0;
588 count1 = count - 1;
589 num_shifts = fls(count1);
590 mask = scan->u.bmu_bitmap;
591 while ((-mask & ~mask) != 0 && num_shifts > 0) {
592 /*
593 * If bit i is set in mask, then bits in [i, i+range1] are set
594 * in scan->u.bmu_bitmap. The value of range1 is equal to
595 * count1 >> num_shifts. Grow range and reduce num_shifts to 0,
596 * while preserving these invariants. The updates to mask leave
597 * fewer bits set, but each bit that remains set represents a
598 * longer string of consecutive bits set in scan->u.bmu_bitmap.
599 * If more updates to mask cannot clear more bits, because mask
600 * is partitioned with all 0 bits preceding all 1 bits, the loop
601 * terminates immediately.
602 */
603 num_shifts--;
604 range_ext = range1 + ((count1 >> num_shifts) & 1);
605 /*
606 * mask is a signed quantity for the shift because when it is
607 * shifted right, the sign bit should copied; when the last
608 * block of the leaf is free, pretend, for a while, that all the
609 * blocks that follow it are also free.
610 */
611 mask &= (daddr_t)mask >> range_ext;
612 range1 += range_ext;
613 }
614 if (mask == 0) {
615 /*
616 * Update bighint. There is no allocation bigger than range1
617 * starting in this leaf.
618 */
619 scan->bm_bighint = range1;
620 return (SWAPBLK_NONE);
621 }
622
623 /* Discard any candidates that appear before blk. */
624 mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
625 if (mask == 0)
626 return (SWAPBLK_NONE);
627
628 /*
629 * The least significant set bit in mask marks the start of the first
630 * available range of sufficient size. Clear all the bits but that one,
631 * and then find its position.
632 */
633 mask &= -mask;
634 lo = bitpos(mask);
635
636 hi = lo + count;
637 if (hi > BLIST_BMAP_RADIX) {
638 /*
639 * An allocation within this leaf is impossible, so a successful
640 * allocation depends on the next leaf providing some of the blocks.
641 */
642 if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
643 /*
644 * The next leaf has a different meta-node parent, so it
645 * is not necessarily initialized. Update bighint,
646 * comparing the range found at the end of mask to the
647 * largest earlier range that could have been made to
648 * vanish in the initial processing of mask.
649 */
650 scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
651 return (SWAPBLK_NONE);
652 }
653 hi -= BLIST_BMAP_RADIX;
654 if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
655 /*
656 * The next leaf doesn't have enough free blocks at the
657 * beginning to complete the spanning allocation. The
658 * hint cannot be updated, because the same allocation
659 * request could be satisfied later, by this leaf, if
660 * the state of the next leaf changes, and without any
661 * changes to this leaf.
662 */
663 return (SWAPBLK_NONE);
664 }
665 /* Clear the first 'hi' bits in the next leaf, allocating them. */
666 scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
667 hi = BLIST_BMAP_RADIX;
668 }
669
670 /* Set the bits of mask at position 'lo' and higher. */
671 mask = -mask;
672 if (hi == BLIST_BMAP_RADIX) {
673 /*
674 * Update bighint. There is no allocation bigger than range1
675 * available in this leaf after this allocation completes.
676 */
677 scan->bm_bighint = range1;
678 } else {
679 /* Clear the bits of mask at position 'hi' and higher. */
680 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
681 /* If this allocation uses all the bits, clear the hint. */
682 if (mask == scan->u.bmu_bitmap)
683 scan->bm_bighint = 0;
684 }
685 /* Clear the allocated bits from this leaf. */
686 scan->u.bmu_bitmap &= ~mask;
687 return ((blk & ~BLIST_BMAP_MASK) + lo);
688 }
689
690 /*
691 * blist_meta_alloc() - allocate at a meta in the radix tree.
692 *
693 * Attempt to allocate at a meta node. If we can't, we update
694 * bighint and return a failure. Updating bighint optimize future
695 * calls that hit this node. We have to check for our collapse cases
696 * and we have a few optimizations strewn in as well.
697 */
698 static daddr_t
699 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
700 {
701 daddr_t blk, i, next_skip, r, skip;
702 int child;
703 bool scan_from_start;
704
705 if (radix == BLIST_BMAP_RADIX)
706 return (blst_leaf_alloc(scan, cursor, count));
707 if (scan->u.bmu_avail < count) {
708 /*
709 * The meta node's hint must be too large if the allocation
710 * exceeds the number of free blocks. Reduce the hint, and
711 * return failure.
712 */
713 scan->bm_bighint = scan->u.bmu_avail;
714 return (SWAPBLK_NONE);
715 }
716 blk = cursor & -radix;
717 skip = radix_to_skip(radix);
718 next_skip = skip / BLIST_META_RADIX;
719
720 /*
721 * An ALL-FREE meta node requires special handling before allocating
722 * any of its blocks.
723 */
724 if (scan->u.bmu_avail == radix) {
725 radix /= BLIST_META_RADIX;
726
727 /*
728 * Reinitialize each of the meta node's children. An ALL-FREE
729 * meta node cannot have a terminator in any subtree.
730 */
731 for (i = 1; i < skip; i += next_skip) {
732 if (next_skip == 1)
733 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
734 else
735 scan[i].u.bmu_avail = radix;
736 scan[i].bm_bighint = radix;
737 }
738 } else {
739 radix /= BLIST_META_RADIX;
740 }
741
742 if (count > radix) {
743 /*
744 * The allocation exceeds the number of blocks that are
745 * managed by a subtree of this meta node.
746 */
747 panic("allocation too large");
748 }
749 scan_from_start = cursor == blk;
750 child = (cursor - blk) / radix;
751 blk += child * radix;
752 for (i = 1 + child * next_skip; i < skip; i += next_skip) {
753 if (count <= scan[i].bm_bighint) {
754 /*
755 * The allocation might fit beginning in the i'th subtree.
756 */
757 r = blst_meta_alloc(&scan[i],
758 cursor > blk ? cursor : blk, count, radix);
759 if (r != SWAPBLK_NONE) {
760 scan->u.bmu_avail -= count;
761 return (r);
762 }
763 } else if (scan[i].bm_bighint == (daddr_t)-1) {
764 /*
765 * Terminator
766 */
767 break;
768 }
769 blk += radix;
770 }
771
772 /*
773 * We couldn't allocate count in this subtree, update bighint.
774 */
775 if (scan_from_start && scan->bm_bighint >= count)
776 scan->bm_bighint = count - 1;
777
778 return (SWAPBLK_NONE);
779 }
780
781 /*
782 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
783 *
784 */
785 static void
786 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
787 {
788 u_daddr_t mask;
789 int n;
790
791 /*
792 * free some data in this bitmap
793 * mask=0000111111111110000
794 * \_________/\__/
795 * count n
796 */
797 n = blk & BLIST_BMAP_MASK;
798 mask = ((u_daddr_t)-1 << n) &
799 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
800 if (scan->u.bmu_bitmap & mask)
801 panic("freeing free block");
802 scan->u.bmu_bitmap |= mask;
803
804 /*
805 * We could probably do a better job here. We are required to make
806 * bighint at least as large as the biggest contiguous block of
807 * data. If we just shoehorn it, a little extra overhead will
808 * be incured on the next allocation (but only that one typically).
809 */
810 scan->bm_bighint = BLIST_BMAP_RADIX;
811 }
812
813 /*
814 * BLST_META_FREE() - free allocated blocks from radix tree meta info
815 *
816 * This support routine frees a range of blocks from the bitmap.
817 * The range must be entirely enclosed by this radix node. If a
818 * meta node, we break the range down recursively to free blocks
819 * in subnodes (which means that this code can free an arbitrary
820 * range whereas the allocation code cannot allocate an arbitrary
821 * range).
822 */
823 static void
824 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
825 {
826 daddr_t blk, i, next_skip, skip, v;
827 int child;
828
829 if (scan->bm_bighint == (daddr_t)-1)
830 panic("freeing invalid range");
831 if (radix == BLIST_BMAP_RADIX)
832 return (blst_leaf_free(scan, freeBlk, count));
833 skip = radix_to_skip(radix);
834 next_skip = skip / BLIST_META_RADIX;
835
836 if (scan->u.bmu_avail == 0) {
837 /*
838 * ALL-ALLOCATED special case, with possible
839 * shortcut to ALL-FREE special case.
840 */
841 scan->u.bmu_avail = count;
842 scan->bm_bighint = count;
843
844 if (count != radix) {
845 for (i = 1; i < skip; i += next_skip) {
846 if (scan[i].bm_bighint == (daddr_t)-1)
847 break;
848 scan[i].bm_bighint = 0;
849 if (next_skip == 1) {
850 scan[i].u.bmu_bitmap = 0;
851 } else {
852 scan[i].u.bmu_avail = 0;
853 }
854 }
855 /* fall through */
856 }
857 } else {
858 scan->u.bmu_avail += count;
859 /* scan->bm_bighint = radix; */
860 }
861
862 /*
863 * ALL-FREE special case.
864 */
865
866 if (scan->u.bmu_avail == radix)
867 return;
868 if (scan->u.bmu_avail > radix)
869 panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
870 (long long)count, (long long)scan->u.bmu_avail,
871 (long long)radix);
872
873 /*
874 * Break the free down into its components
875 */
876
877 blk = freeBlk & -radix;
878 radix /= BLIST_META_RADIX;
879
880 child = (freeBlk - blk) / radix;
881 blk += child * radix;
882 i = 1 + child * next_skip;
883 while (i < skip && blk < freeBlk + count) {
884 v = blk + radix - freeBlk;
885 if (v > count)
886 v = count;
887 blst_meta_free(&scan[i], freeBlk, v, radix);
888 if (scan->bm_bighint < scan[i].bm_bighint)
889 scan->bm_bighint = scan[i].bm_bighint;
890 count -= v;
891 freeBlk += v;
892 blk += radix;
893 i += next_skip;
894 }
895 }
896
897 /*
898 * BLIST_RADIX_COPY() - copy one radix tree to another
899 *
900 * Locates free space in the source tree and frees it in the destination
901 * tree. The space may not already be free in the destination.
902 */
903 static void
904 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
905 daddr_t count)
906 {
907 daddr_t i, next_skip, skip;
908
909 /*
910 * Leaf node
911 */
912
913 if (radix == BLIST_BMAP_RADIX) {
914 u_daddr_t v = scan->u.bmu_bitmap;
915
916 if (v == (u_daddr_t)-1) {
917 blist_free(dest, blk, count);
918 } else if (v != 0) {
919 int i;
920
921 for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
922 if (v & ((u_daddr_t)1 << i))
923 blist_free(dest, blk + i, 1);
924 }
925 }
926 return;
927 }
928
929 /*
930 * Meta node
931 */
932
933 if (scan->u.bmu_avail == 0) {
934 /*
935 * Source all allocated, leave dest allocated
936 */
937 return;
938 }
939 if (scan->u.bmu_avail == radix) {
940 /*
941 * Source all free, free entire dest
942 */
943 if (count < radix)
944 blist_free(dest, blk, count);
945 else
946 blist_free(dest, blk, radix);
947 return;
948 }
949
950
951 skip = radix_to_skip(radix);
952 next_skip = skip / BLIST_META_RADIX;
953 radix /= BLIST_META_RADIX;
954
955 for (i = 1; count && i < skip; i += next_skip) {
956 if (scan[i].bm_bighint == (daddr_t)-1)
957 break;
958
959 if (count >= radix) {
960 blst_copy(&scan[i], blk, radix, dest, radix);
961 count -= radix;
962 } else {
963 if (count) {
964 blst_copy(&scan[i], blk, radix, dest, count);
965 }
966 count = 0;
967 }
968 blk += radix;
969 }
970 }
971
972 /*
973 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
974 *
975 * This routine allocates all blocks in the specified range
976 * regardless of any existing allocations in that range. Returns
977 * the number of blocks allocated by the call.
978 */
979 static daddr_t
980 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
981 {
982 daddr_t nblks;
983 u_daddr_t mask;
984 int n;
985
986 n = blk & BLIST_BMAP_MASK;
987 mask = ((u_daddr_t)-1 << n) &
988 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
989
990 /* Count the number of blocks that we are allocating. */
991 nblks = bitcount64(scan->u.bmu_bitmap & mask);
992
993 scan->u.bmu_bitmap &= ~mask;
994 return (nblks);
995 }
996
997 /*
998 * BLIST_META_FILL() - allocate specific blocks at a meta node
999 *
1000 * This routine allocates the specified range of blocks,
1001 * regardless of any existing allocations in the range. The
1002 * range must be within the extent of this node. Returns the
1003 * number of blocks allocated by the call.
1004 */
1005 static daddr_t
1006 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1007 {
1008 daddr_t blk, i, nblks, next_skip, skip, v;
1009 int child;
1010
1011 if (scan->bm_bighint == (daddr_t)-1)
1012 panic("filling invalid range");
1013 if (count > radix) {
1014 /*
1015 * The allocation exceeds the number of blocks that are
1016 * managed by this node.
1017 */
1018 panic("fill too large");
1019 }
1020 if (radix == BLIST_BMAP_RADIX)
1021 return (blst_leaf_fill(scan, allocBlk, count));
1022 if (count == radix || scan->u.bmu_avail == 0) {
1023 /*
1024 * ALL-ALLOCATED special case
1025 */
1026 nblks = scan->u.bmu_avail;
1027 scan->u.bmu_avail = 0;
1028 scan->bm_bighint = 0;
1029 return (nblks);
1030 }
1031 skip = radix_to_skip(radix);
1032 next_skip = skip / BLIST_META_RADIX;
1033 blk = allocBlk & -radix;
1034
1035 /*
1036 * An ALL-FREE meta node requires special handling before allocating
1037 * any of its blocks.
1038 */
1039 if (scan->u.bmu_avail == radix) {
1040 radix /= BLIST_META_RADIX;
1041
1042 /*
1043 * Reinitialize each of the meta node's children. An ALL-FREE
1044 * meta node cannot have a terminator in any subtree.
1045 */
1046 for (i = 1; i < skip; i += next_skip) {
1047 if (next_skip == 1)
1048 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
1049 else
1050 scan[i].u.bmu_avail = radix;
1051 scan[i].bm_bighint = radix;
1052 }
1053 } else {
1054 radix /= BLIST_META_RADIX;
1055 }
1056
1057 nblks = 0;
1058 child = (allocBlk - blk) / radix;
1059 blk += child * radix;
1060 i = 1 + child * next_skip;
1061 while (i < skip && blk < allocBlk + count) {
1062 v = blk + radix - allocBlk;
1063 if (v > count)
1064 v = count;
1065 nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
1066 count -= v;
1067 allocBlk += v;
1068 blk += radix;
1069 i += next_skip;
1070 }
1071 scan->u.bmu_avail -= nblks;
1072 return (nblks);
1073 }
1074
1075 /*
1076 * BLST_RADIX_INIT() - initialize radix tree
1077 *
1078 * Initialize our meta structures and bitmaps and calculate the exact
1079 * amount of space required to manage 'count' blocks - this space may
1080 * be considerably less than the calculated radix due to the large
1081 * RADIX values we use.
1082 */
1083 static daddr_t
1084 blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
1085 {
1086 daddr_t i, memindex, next_skip, skip;
1087
1088 memindex = 0;
1089
1090 /*
1091 * Leaf node
1092 */
1093
1094 if (radix == BLIST_BMAP_RADIX) {
1095 if (scan) {
1096 scan->bm_bighint = 0;
1097 scan->u.bmu_bitmap = 0;
1098 }
1099 return (memindex);
1100 }
1101
1102 /*
1103 * Meta node. If allocating the entire object we can special
1104 * case it. However, we need to figure out how much memory
1105 * is required to manage 'count' blocks, so we continue on anyway.
1106 */
1107
1108 if (scan) {
1109 scan->bm_bighint = 0;
1110 scan->u.bmu_avail = 0;
1111 }
1112
1113 skip = radix_to_skip(radix);
1114 next_skip = skip / BLIST_META_RADIX;
1115 radix /= BLIST_META_RADIX;
1116
1117 for (i = 1; i < skip; i += next_skip) {
1118 if (count >= radix) {
1119 /*
1120 * Allocate the entire object
1121 */
1122 memindex = i +
1123 blst_radix_init(((scan) ? &scan[i] : NULL), radix,
1124 radix);
1125 count -= radix;
1126 } else if (count > 0) {
1127 /*
1128 * Allocate a partial object
1129 */
1130 memindex = i +
1131 blst_radix_init(((scan) ? &scan[i] : NULL), radix,
1132 count);
1133 count = 0;
1134 } else {
1135 /*
1136 * Add terminator and break out. Make terminator bitmap
1137 * zero to avoid a spanning leaf allocation that
1138 * includes the terminator.
1139 */
1140 if (scan) {
1141 scan[i].bm_bighint = (daddr_t)-1;
1142 scan[i].u.bmu_bitmap = 0;
1143 }
1144 break;
1145 }
1146 }
1147 if (memindex < i)
1148 memindex = i;
1149 return (memindex);
1150 }
1151
1152 #ifdef BLIST_DEBUG
1153
1154 static void
1155 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1156 {
1157 daddr_t i, next_skip, skip;
1158
1159 if (radix == BLIST_BMAP_RADIX) {
1160 printf(
1161 "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
1162 tab, tab, "",
1163 (long long)blk, (long long)radix,
1164 (long long)scan->u.bmu_bitmap,
1165 (long long)scan->bm_bighint
1166 );
1167 return;
1168 }
1169
1170 if (scan->u.bmu_avail == 0) {
1171 printf(
1172 "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
1173 tab, tab, "",
1174 (long long)blk,
1175 (long long)radix
1176 );
1177 return;
1178 }
1179 if (scan->u.bmu_avail == radix) {
1180 printf(
1181 "%*.*s(%08llx,%lld) ALL FREE\n",
1182 tab, tab, "",
1183 (long long)blk,
1184 (long long)radix
1185 );
1186 return;
1187 }
1188
1189 printf(
1190 "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
1191 tab, tab, "",
1192 (long long)blk, (long long)radix,
1193 (long long)scan->u.bmu_avail,
1194 (long long)radix,
1195 (long long)scan->bm_bighint
1196 );
1197
1198 skip = radix_to_skip(radix);
1199 next_skip = skip / BLIST_META_RADIX;
1200 radix /= BLIST_META_RADIX;
1201 tab += 4;
1202
1203 for (i = 1; i < skip; i += next_skip) {
1204 if (scan[i].bm_bighint == (daddr_t)-1) {
1205 printf(
1206 "%*.*s(%08llx,%lld): Terminator\n",
1207 tab, tab, "",
1208 (long long)blk, (long long)radix
1209 );
1210 break;
1211 }
1212 blst_radix_print(&scan[i], blk, radix, tab);
1213 blk += radix;
1214 }
1215 tab -= 4;
1216
1217 printf(
1218 "%*.*s}\n",
1219 tab, tab, ""
1220 );
1221 }
1222
1223 #endif
1224
1225 #ifdef BLIST_DEBUG
1226
1227 int
1228 main(int ac, char **av)
1229 {
1230 int size = 1024;
1231 int i;
1232 blist_t bl;
1233 struct sbuf *s;
1234
1235 for (i = 1; i < ac; ++i) {
1236 const char *ptr = av[i];
1237 if (*ptr != '-') {
1238 size = strtol(ptr, NULL, 0);
1239 continue;
1240 }
1241 ptr += 2;
1242 fprintf(stderr, "Bad option: %s\n", ptr - 2);
1243 exit(1);
1244 }
1245 bl = blist_create(size, M_WAITOK);
1246 blist_free(bl, 0, size);
1247
1248 for (;;) {
1249 char buf[1024];
1250 long long da = 0;
1251 long long count = 0;
1252
1253 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1254 (long long)size, (long long)bl->bl_radix);
1255 fflush(stdout);
1256 if (fgets(buf, sizeof(buf), stdin) == NULL)
1257 break;
1258 switch(buf[0]) {
1259 case 'r':
1260 if (sscanf(buf + 1, "%lld", &count) == 1) {
1261 blist_resize(&bl, count, 1, M_WAITOK);
1262 } else {
1263 printf("?\n");
1264 }
1265 case 'p':
1266 blist_print(bl);
1267 break;
1268 case 's':
1269 s = sbuf_new_auto();
1270 blist_stats(bl, s);
1271 sbuf_finish(s);
1272 printf("%s", sbuf_data(s));
1273 sbuf_delete(s);
1274 break;
1275 case 'a':
1276 if (sscanf(buf + 1, "%lld", &count) == 1) {
1277 daddr_t blk = blist_alloc(bl, count);
1278 printf(" R=%08llx\n", (long long)blk);
1279 } else {
1280 printf("?\n");
1281 }
1282 break;
1283 case 'f':
1284 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1285 blist_free(bl, da, count);
1286 } else {
1287 printf("?\n");
1288 }
1289 break;
1290 case 'l':
1291 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1292 printf(" n=%jd\n",
1293 (intmax_t)blist_fill(bl, da, count));
1294 } else {
1295 printf("?\n");
1296 }
1297 break;
1298 case '?':
1299 case 'h':
1300 puts(
1301 "p -print\n"
1302 "s -stats\n"
1303 "a %d -allocate\n"
1304 "f %x %d -free\n"
1305 "l %x %d -fill\n"
1306 "r %d -resize\n"
1307 "h/? -help"
1308 );
1309 break;
1310 default:
1311 printf("?\n");
1312 break;
1313 }
1314 }
1315 return(0);
1316 }
1317
1318 void
1319 panic(const char *ctl, ...)
1320 {
1321 va_list va;
1322
1323 va_start(va, ctl);
1324 vfprintf(stderr, ctl, va);
1325 fprintf(stderr, "\n");
1326 va_end(va);
1327 exit(1);
1328 }
1329
1330 #endif
Cache object: 3fbcfcf81bd961062d97be9341081b34
|