FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_blist.c
1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 /*
30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
31 *
32 * This module implements a general bitmap allocator/deallocator. The
33 * allocator eats around 2 bits per 'block'. The module does not
34 * try to interpret the meaning of a 'block' other than to return
35 * SWAPBLK_NONE on an allocation failure.
36 *
37 * A radix tree controls access to pieces of the bitmap, and includes
38 * auxiliary information at each interior node about the availabilty of
39 * contiguous free blocks in the subtree rooted at that node. Two radix
40 * constants are involved: one for the size of the bitmaps contained in the
41 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
42 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is
43 * associated with a range of blocks. The root of any subtree stores a
44 * hint field that defines an upper bound on the size of the largest
45 * allocation that can begin in the associated block range. A hint is an
46 * upper bound on a potential allocation, but not necessarily a tight upper
47 * bound.
48 *
49 * The radix tree also implements two collapsed states for meta nodes:
50 * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
51 * in either of these two states, all information contained underneath
52 * the node is considered stale. These states are used to optimize
53 * allocation and freeing operations.
54 *
55 * The hinting greatly increases code efficiency for allocations while
56 * the general radix structure optimizes both allocations and frees. The
57 * radix tree should be able to operate well no matter how much
58 * fragmentation there is and no matter how large a bitmap is used.
59 *
60 * The blist code wires all necessary memory at creation time. Neither
61 * allocations nor frees require interaction with the memory subsystem.
62 * The non-blocking features of the blist code are used in the swap code
63 * (vm/swap_pager.c).
64 *
65 * LAYOUT: The radix tree is laid out recursively using a
66 * linear array. Each meta node is immediately followed (laid out
67 * sequentially in memory) by BLIST_META_RADIX lower level nodes. This
68 * is a recursive structure but one that can be easily scanned through
69 * a very simple 'skip' calculation. In order to support large radixes,
70 * portions of the tree may reside outside our memory allocation. We
71 * handle this with an early-termination optimization (when bighint is
72 * set to -1) on the scan. The memory allocation is only large enough
73 * to cover the number of blocks requested at creation time even if it
74 * must be encompassed in larger root-node radix.
75 *
76 * NOTE: the allocator cannot currently allocate more than
77 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
78 * large' if you try. This is an area that could use improvement. The
79 * radix is large enough that this restriction does not effect the swap
80 * system, though. Currently only the allocation code is affected by
81 * this algorithmic unfeature. The freeing code can handle arbitrary
82 * ranges.
83 *
84 * This code can be compiled stand-alone for debugging.
85 */
86
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD: releng/12.0/sys/kern/subr_blist.c 338472 2018-09-05 19:05:30Z markj $");
89
90 #ifdef _KERNEL
91
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101
102 #else
103
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107
108 #include <sys/types.h>
109 #include <sys/malloc.h>
110 #include <sys/sbuf.h>
111 #include <stdio.h>
112 #include <string.h>
113 #include <stddef.h>
114 #include <stdlib.h>
115 #include <stdarg.h>
116 #include <stdbool.h>
117
118 #define bitcount64(x) __bitcount64((uint64_t)(x))
119 #define malloc(a,b,c) calloc(a, 1)
120 #define free(a,b) free(a)
121 static __inline int imax(int a, int b) { return (a > b ? a : b); }
122
123 #include <sys/blist.h>
124
125 void panic(const char *ctl, ...);
126
127 #endif
128
129 /*
130 * static support functions
131 */
132 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
133 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
134 u_daddr_t radix);
135 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
136 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
137 u_daddr_t radix);
138 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
139 blist_t dest, daddr_t count);
140 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
141 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
142 u_daddr_t radix);
143 #ifndef _KERNEL
144 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
145 int tab);
146 #endif
147
148 #ifdef _KERNEL
149 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
150 #endif
151
152 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
153 "radix divisibility error");
154 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1)
155 #define BLIST_META_MASK (BLIST_META_RADIX - 1)
156
157 /*
158 * For a subtree that can represent the state of up to 'radix' blocks, the
159 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
160 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
161 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
162 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
163 * in the 'meta' functions that process subtrees. Since integer division
164 * discards remainders, we can express this computation as
165 * skip = (m * m**h) / (m - 1)
166 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
167 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
168 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
169 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
170 * so that simple integer division by a constant can safely be used for the
171 * calculation.
172 */
173 static inline daddr_t
174 radix_to_skip(daddr_t radix)
175 {
176
177 return (radix /
178 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
179 }
180
181 /*
182 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
183 * Assumes that the argument has only one bit set.
184 */
185 static inline int
186 bitpos(u_daddr_t mask)
187 {
188 int hi, lo, mid;
189
190 switch (sizeof(mask)) {
191 #ifdef HAVE_INLINE_FFSLL
192 case sizeof(long long):
193 return (ffsll(mask) - 1);
194 #endif
195 default:
196 lo = 0;
197 hi = BLIST_BMAP_RADIX;
198 while (lo + 1 < hi) {
199 mid = (lo + hi) >> 1;
200 if ((mask >> mid) != 0)
201 lo = mid;
202 else
203 hi = mid;
204 }
205 return (lo);
206 }
207 }
208
209 /*
210 * blist_create() - create a blist capable of handling up to the specified
211 * number of blocks
212 *
213 * blocks - must be greater than 0
214 * flags - malloc flags
215 *
216 * The smallest blist consists of a single leaf node capable of
217 * managing BLIST_BMAP_RADIX blocks.
218 */
219 blist_t
220 blist_create(daddr_t blocks, int flags)
221 {
222 blist_t bl;
223 daddr_t i, last_block;
224 u_daddr_t nodes, radix, skip;
225 int digit;
226
227 if (blocks == 0)
228 panic("invalid block count");
229
230 /*
231 * Calculate the radix and node count used for scanning.
232 */
233 last_block = blocks - 1;
234 radix = BLIST_BMAP_RADIX;
235 while (radix < blocks) {
236 if (((last_block / radix + 1) & BLIST_META_MASK) != 0)
237 /*
238 * We must widen the blist to avoid partially
239 * filled nodes.
240 */
241 last_block |= radix - 1;
242 radix *= BLIST_META_RADIX;
243 }
244
245 /*
246 * Count the meta-nodes in the expanded tree, including the final
247 * terminator, from the bottom level up to the root.
248 */
249 nodes = 1;
250 if (radix - blocks >= BLIST_BMAP_RADIX)
251 nodes++;
252 last_block /= BLIST_BMAP_RADIX;
253 while (last_block > 0) {
254 nodes += last_block + 1;
255 last_block /= BLIST_META_RADIX;
256 }
257 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
258 M_ZERO);
259 if (bl == NULL)
260 return (NULL);
261
262 bl->bl_blocks = blocks;
263 bl->bl_radix = radix;
264 bl->bl_cursor = 0;
265
266 /*
267 * Initialize the empty tree by filling in root values, then initialize
268 * just the terminators in the rest of the tree.
269 */
270 bl->bl_root[0].bm_bighint = 0;
271 if (radix == BLIST_BMAP_RADIX)
272 bl->bl_root[0].u.bmu_bitmap = 0;
273 else
274 bl->bl_root[0].u.bmu_avail = 0;
275 last_block = blocks - 1;
276 i = 0;
277 while (radix > BLIST_BMAP_RADIX) {
278 radix /= BLIST_META_RADIX;
279 skip = radix_to_skip(radix);
280 digit = last_block / radix;
281 i += 1 + digit * skip;
282 if (digit != BLIST_META_MASK) {
283 /*
284 * Add a terminator.
285 */
286 bl->bl_root[i + skip].bm_bighint = (daddr_t)-1;
287 bl->bl_root[i + skip].u.bmu_bitmap = 0;
288 }
289 last_block %= radix;
290 }
291
292 #if defined(BLIST_DEBUG)
293 printf(
294 "BLIST representing %lld blocks (%lld MB of swap)"
295 ", requiring %lldK of ram\n",
296 (long long)bl->bl_blocks,
297 (long long)bl->bl_blocks * 4 / 1024,
298 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
299 );
300 printf("BLIST raw radix tree contains %lld records\n",
301 (long long)nodes);
302 #endif
303
304 return (bl);
305 }
306
307 void
308 blist_destroy(blist_t bl)
309 {
310
311 free(bl, M_SWAP);
312 }
313
314 /*
315 * blist_alloc() - reserve space in the block bitmap. Return the base
316 * of a contiguous region or SWAPBLK_NONE if space could
317 * not be allocated.
318 */
319 daddr_t
320 blist_alloc(blist_t bl, daddr_t count)
321 {
322 daddr_t blk;
323
324 /*
325 * This loop iterates at most twice. An allocation failure in the
326 * first iteration leads to a second iteration only if the cursor was
327 * non-zero. When the cursor is zero, an allocation failure will
328 * reduce the hint, stopping further iterations.
329 */
330 while (count <= bl->bl_root->bm_bighint) {
331 blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
332 bl->bl_radix);
333 if (blk != SWAPBLK_NONE) {
334 bl->bl_cursor = blk + count;
335 if (bl->bl_cursor == bl->bl_blocks)
336 bl->bl_cursor = 0;
337 return (blk);
338 } else if (bl->bl_cursor != 0)
339 bl->bl_cursor = 0;
340 }
341 return (SWAPBLK_NONE);
342 }
343
344 /*
345 * blist_avail() - return the number of free blocks.
346 */
347 daddr_t
348 blist_avail(blist_t bl)
349 {
350
351 if (bl->bl_radix == BLIST_BMAP_RADIX)
352 return (bitcount64(bl->bl_root->u.bmu_bitmap));
353 else
354 return (bl->bl_root->u.bmu_avail);
355 }
356
357 /*
358 * blist_free() - free up space in the block bitmap. Return the base
359 * of a contiguous region. Panic if an inconsistancy is
360 * found.
361 */
362 void
363 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
364 {
365
366 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
367 }
368
369 /*
370 * blist_fill() - mark a region in the block bitmap as off-limits
371 * to the allocator (i.e. allocate it), ignoring any
372 * existing allocations. Return the number of blocks
373 * actually filled that were free before the call.
374 */
375 daddr_t
376 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
377 {
378
379 return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
380 }
381
382 /*
383 * blist_resize() - resize an existing radix tree to handle the
384 * specified number of blocks. This will reallocate
385 * the tree and transfer the previous bitmap to the new
386 * one. When extending the tree you can specify whether
387 * the new blocks are to left allocated or freed.
388 */
389 void
390 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
391 {
392 blist_t newbl = blist_create(count, flags);
393 blist_t save = *pbl;
394
395 *pbl = newbl;
396 if (count > save->bl_blocks)
397 count = save->bl_blocks;
398 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
399
400 /*
401 * If resizing upwards, should we free the new space or not?
402 */
403 if (freenew && count < newbl->bl_blocks) {
404 blist_free(newbl, count, newbl->bl_blocks - count);
405 }
406 blist_destroy(save);
407 }
408
409 #ifdef BLIST_DEBUG
410
411 /*
412 * blist_print() - dump radix tree
413 */
414 void
415 blist_print(blist_t bl)
416 {
417 printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
418 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
419 printf("}\n");
420 }
421
422 #endif
423
424 static const u_daddr_t fib[] = {
425 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
426 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
427 514229, 832040, 1346269, 2178309, 3524578,
428 };
429
430 /*
431 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
432 */
433 struct gap_stats {
434 daddr_t start; /* current gap start, or SWAPBLK_NONE */
435 daddr_t num; /* number of gaps observed */
436 daddr_t max; /* largest gap size */
437 daddr_t avg; /* average gap size */
438 daddr_t err; /* sum - num * avg */
439 daddr_t histo[nitems(fib)]; /* # gaps in each size range */
440 int max_bucket; /* last histo elt with nonzero val */
441 };
442
443 /*
444 * gap_stats_counting() - is the state 'counting 1 bits'?
445 * or 'skipping 0 bits'?
446 */
447 static inline bool
448 gap_stats_counting(const struct gap_stats *stats)
449 {
450
451 return (stats->start != SWAPBLK_NONE);
452 }
453
454 /*
455 * init_gap_stats() - initialize stats on gap sizes
456 */
457 static inline void
458 init_gap_stats(struct gap_stats *stats)
459 {
460
461 bzero(stats, sizeof(*stats));
462 stats->start = SWAPBLK_NONE;
463 }
464
465 /*
466 * update_gap_stats() - update stats on gap sizes
467 */
468 static void
469 update_gap_stats(struct gap_stats *stats, daddr_t posn)
470 {
471 daddr_t size;
472 int hi, lo, mid;
473
474 if (!gap_stats_counting(stats)) {
475 stats->start = posn;
476 return;
477 }
478 size = posn - stats->start;
479 stats->start = SWAPBLK_NONE;
480 if (size > stats->max)
481 stats->max = size;
482
483 /*
484 * Find the fibonacci range that contains size,
485 * expecting to find it in an early range.
486 */
487 lo = 0;
488 hi = 1;
489 while (hi < nitems(fib) && fib[hi] <= size) {
490 lo = hi;
491 hi *= 2;
492 }
493 if (hi >= nitems(fib))
494 hi = nitems(fib);
495 while (lo + 1 != hi) {
496 mid = (lo + hi) >> 1;
497 if (fib[mid] <= size)
498 lo = mid;
499 else
500 hi = mid;
501 }
502 stats->histo[lo]++;
503 if (lo > stats->max_bucket)
504 stats->max_bucket = lo;
505 stats->err += size - stats->avg;
506 stats->num++;
507 stats->avg += stats->err / stats->num;
508 stats->err %= stats->num;
509 }
510
511 /*
512 * dump_gap_stats() - print stats on gap sizes
513 */
514 static inline void
515 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
516 {
517 int i;
518
519 sbuf_printf(s, "number of maximal free ranges: %jd\n",
520 (intmax_t)stats->num);
521 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
522 sbuf_printf(s, "average maximal free range size: %jd\n",
523 (intmax_t)stats->avg);
524 sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
525 sbuf_printf(s, " count | size range\n");
526 sbuf_printf(s, " ----- | ----------\n");
527 for (i = 0; i < stats->max_bucket; i++) {
528 if (stats->histo[i] != 0) {
529 sbuf_printf(s, "%20jd | ",
530 (intmax_t)stats->histo[i]);
531 if (fib[i] != fib[i + 1] - 1)
532 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
533 (intmax_t)fib[i + 1] - 1);
534 else
535 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
536 }
537 }
538 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
539 if (stats->histo[i] > 1)
540 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
541 (intmax_t)stats->max);
542 else
543 sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
544 }
545
546 /*
547 * blist_stats() - dump radix tree stats
548 */
549 void
550 blist_stats(blist_t bl, struct sbuf *s)
551 {
552 struct gap_stats gstats;
553 struct gap_stats *stats = &gstats;
554 daddr_t i, nodes, radix;
555 u_daddr_t bit, diff, mask;
556
557 init_gap_stats(stats);
558 nodes = 0;
559 i = bl->bl_radix;
560 while (i < bl->bl_radix + bl->bl_blocks) {
561 /*
562 * Find max size subtree starting at i.
563 */
564 radix = BLIST_BMAP_RADIX;
565 while (((i / radix) & BLIST_META_MASK) == 0)
566 radix *= BLIST_META_RADIX;
567
568 /*
569 * Check for skippable subtrees starting at i.
570 */
571 while (radix > BLIST_BMAP_RADIX) {
572 if (bl->bl_root[nodes].u.bmu_avail == 0) {
573 if (gap_stats_counting(stats))
574 update_gap_stats(stats, i);
575 break;
576 }
577 if (bl->bl_root[nodes].u.bmu_avail == radix) {
578 if (!gap_stats_counting(stats))
579 update_gap_stats(stats, i);
580 break;
581 }
582
583 /*
584 * Skip subtree root.
585 */
586 nodes++;
587 radix /= BLIST_META_RADIX;
588 }
589 if (radix == BLIST_BMAP_RADIX) {
590 /*
591 * Scan leaf.
592 */
593 mask = bl->bl_root[nodes].u.bmu_bitmap;
594 diff = mask ^ (mask << 1);
595 if (gap_stats_counting(stats))
596 diff ^= 1;
597 while (diff != 0) {
598 bit = diff & -diff;
599 update_gap_stats(stats, i + bitpos(bit));
600 diff ^= bit;
601 }
602 }
603 nodes += radix_to_skip(radix);
604 i += radix;
605 }
606 update_gap_stats(stats, i);
607 dump_gap_stats(stats, s);
608 }
609
610 /************************************************************************
611 * ALLOCATION SUPPORT FUNCTIONS *
612 ************************************************************************
613 *
614 * These support functions do all the actual work. They may seem
615 * rather longish, but that's because I've commented them up. The
616 * actual code is straight forward.
617 *
618 */
619
620 /*
621 * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
622 *
623 * This is the core of the allocator and is optimized for the
624 * BLIST_BMAP_RADIX block allocation case. Otherwise, execution
625 * time is proportional to log2(count) + bitpos time.
626 */
627 static daddr_t
628 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
629 {
630 u_daddr_t mask;
631 int count1, hi, lo, num_shifts, range1, range_ext;
632
633 range1 = 0;
634 count1 = count - 1;
635 num_shifts = fls(count1);
636 mask = scan->u.bmu_bitmap;
637 while ((-mask & ~mask) != 0 && num_shifts > 0) {
638 /*
639 * If bit i is set in mask, then bits in [i, i+range1] are set
640 * in scan->u.bmu_bitmap. The value of range1 is equal to
641 * count1 >> num_shifts. Grow range and reduce num_shifts to 0,
642 * while preserving these invariants. The updates to mask leave
643 * fewer bits set, but each bit that remains set represents a
644 * longer string of consecutive bits set in scan->u.bmu_bitmap.
645 * If more updates to mask cannot clear more bits, because mask
646 * is partitioned with all 0 bits preceding all 1 bits, the loop
647 * terminates immediately.
648 */
649 num_shifts--;
650 range_ext = range1 + ((count1 >> num_shifts) & 1);
651 /*
652 * mask is a signed quantity for the shift because when it is
653 * shifted right, the sign bit should copied; when the last
654 * block of the leaf is free, pretend, for a while, that all the
655 * blocks that follow it are also free.
656 */
657 mask &= (daddr_t)mask >> range_ext;
658 range1 += range_ext;
659 }
660 if (mask == 0) {
661 /*
662 * Update bighint. There is no allocation bigger than range1
663 * starting in this leaf.
664 */
665 scan->bm_bighint = range1;
666 return (SWAPBLK_NONE);
667 }
668
669 /* Discard any candidates that appear before blk. */
670 mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
671 if (mask == 0)
672 return (SWAPBLK_NONE);
673
674 /*
675 * The least significant set bit in mask marks the start of the first
676 * available range of sufficient size. Clear all the bits but that one,
677 * and then find its position.
678 */
679 mask &= -mask;
680 lo = bitpos(mask);
681
682 hi = lo + count;
683 if (hi > BLIST_BMAP_RADIX) {
684 /*
685 * An allocation within this leaf is impossible, so a successful
686 * allocation depends on the next leaf providing some of the blocks.
687 */
688 if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
689 /*
690 * The next leaf has a different meta-node parent, so it
691 * is not necessarily initialized. Update bighint,
692 * comparing the range found at the end of mask to the
693 * largest earlier range that could have been made to
694 * vanish in the initial processing of mask.
695 */
696 scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
697 return (SWAPBLK_NONE);
698 }
699 hi -= BLIST_BMAP_RADIX;
700 if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
701 /*
702 * The next leaf doesn't have enough free blocks at the
703 * beginning to complete the spanning allocation. The
704 * hint cannot be updated, because the same allocation
705 * request could be satisfied later, by this leaf, if
706 * the state of the next leaf changes, and without any
707 * changes to this leaf.
708 */
709 return (SWAPBLK_NONE);
710 }
711 /* Clear the first 'hi' bits in the next leaf, allocating them. */
712 scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
713 hi = BLIST_BMAP_RADIX;
714 }
715
716 /* Set the bits of mask at position 'lo' and higher. */
717 mask = -mask;
718 if (hi == BLIST_BMAP_RADIX) {
719 /*
720 * Update bighint. There is no allocation bigger than range1
721 * available in this leaf after this allocation completes.
722 */
723 scan->bm_bighint = range1;
724 } else {
725 /* Clear the bits of mask at position 'hi' and higher. */
726 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
727 /* If this allocation uses all the bits, clear the hint. */
728 if (mask == scan->u.bmu_bitmap)
729 scan->bm_bighint = 0;
730 }
731 /* Clear the allocated bits from this leaf. */
732 scan->u.bmu_bitmap &= ~mask;
733 return ((blk & ~BLIST_BMAP_MASK) + lo);
734 }
735
736 /*
737 * blist_meta_alloc() - allocate at a meta in the radix tree.
738 *
739 * Attempt to allocate at a meta node. If we can't, we update
740 * bighint and return a failure. Updating bighint optimize future
741 * calls that hit this node. We have to check for our collapse cases
742 * and we have a few optimizations strewn in as well.
743 */
744 static daddr_t
745 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
746 {
747 daddr_t blk, i, next_skip, r, skip;
748 int child;
749 bool scan_from_start;
750
751 if (radix == BLIST_BMAP_RADIX)
752 return (blst_leaf_alloc(scan, cursor, count));
753 if (scan->u.bmu_avail < count) {
754 /*
755 * The meta node's hint must be too large if the allocation
756 * exceeds the number of free blocks. Reduce the hint, and
757 * return failure.
758 */
759 scan->bm_bighint = scan->u.bmu_avail;
760 return (SWAPBLK_NONE);
761 }
762 blk = cursor & -radix;
763 skip = radix_to_skip(radix);
764 next_skip = skip / BLIST_META_RADIX;
765
766 /*
767 * An ALL-FREE meta node requires special handling before allocating
768 * any of its blocks.
769 */
770 if (scan->u.bmu_avail == radix) {
771 radix /= BLIST_META_RADIX;
772
773 /*
774 * Reinitialize each of the meta node's children. An ALL-FREE
775 * meta node cannot have a terminator in any subtree.
776 */
777 for (i = 1; i < skip; i += next_skip) {
778 if (next_skip == 1)
779 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
780 else
781 scan[i].u.bmu_avail = radix;
782 scan[i].bm_bighint = radix;
783 }
784 } else {
785 radix /= BLIST_META_RADIX;
786 }
787
788 if (count > radix) {
789 /*
790 * The allocation exceeds the number of blocks that are
791 * managed by a subtree of this meta node.
792 */
793 panic("allocation too large");
794 }
795 scan_from_start = cursor == blk;
796 child = (cursor - blk) / radix;
797 blk += child * radix;
798 for (i = 1 + child * next_skip; i < skip; i += next_skip) {
799 if (count <= scan[i].bm_bighint) {
800 /*
801 * The allocation might fit beginning in the i'th subtree.
802 */
803 r = blst_meta_alloc(&scan[i],
804 cursor > blk ? cursor : blk, count, radix);
805 if (r != SWAPBLK_NONE) {
806 scan->u.bmu_avail -= count;
807 return (r);
808 }
809 } else if (scan[i].bm_bighint == (daddr_t)-1) {
810 /*
811 * Terminator
812 */
813 break;
814 }
815 blk += radix;
816 }
817
818 /*
819 * We couldn't allocate count in this subtree, update bighint.
820 */
821 if (scan_from_start && scan->bm_bighint >= count)
822 scan->bm_bighint = count - 1;
823
824 return (SWAPBLK_NONE);
825 }
826
827 /*
828 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
829 *
830 */
831 static void
832 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
833 {
834 u_daddr_t mask;
835 int n;
836
837 /*
838 * free some data in this bitmap
839 * mask=0000111111111110000
840 * \_________/\__/
841 * count n
842 */
843 n = blk & BLIST_BMAP_MASK;
844 mask = ((u_daddr_t)-1 << n) &
845 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
846 if (scan->u.bmu_bitmap & mask)
847 panic("freeing free block");
848 scan->u.bmu_bitmap |= mask;
849
850 /*
851 * We could probably do a better job here. We are required to make
852 * bighint at least as large as the biggest contiguous block of
853 * data. If we just shoehorn it, a little extra overhead will
854 * be incured on the next allocation (but only that one typically).
855 */
856 scan->bm_bighint = BLIST_BMAP_RADIX;
857 }
858
859 /*
860 * BLST_META_FREE() - free allocated blocks from radix tree meta info
861 *
862 * This support routine frees a range of blocks from the bitmap.
863 * The range must be entirely enclosed by this radix node. If a
864 * meta node, we break the range down recursively to free blocks
865 * in subnodes (which means that this code can free an arbitrary
866 * range whereas the allocation code cannot allocate an arbitrary
867 * range).
868 */
869 static void
870 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
871 {
872 daddr_t blk, i, next_skip, skip, v;
873 int child;
874
875 if (scan->bm_bighint == (daddr_t)-1)
876 panic("freeing invalid range");
877 if (radix == BLIST_BMAP_RADIX)
878 return (blst_leaf_free(scan, freeBlk, count));
879 skip = radix_to_skip(radix);
880 next_skip = skip / BLIST_META_RADIX;
881
882 if (scan->u.bmu_avail == 0) {
883 /*
884 * ALL-ALLOCATED special case, with possible
885 * shortcut to ALL-FREE special case.
886 */
887 scan->u.bmu_avail = count;
888 scan->bm_bighint = count;
889
890 if (count != radix) {
891 for (i = 1; i < skip; i += next_skip) {
892 if (scan[i].bm_bighint == (daddr_t)-1)
893 break;
894 scan[i].bm_bighint = 0;
895 if (next_skip == 1) {
896 scan[i].u.bmu_bitmap = 0;
897 } else {
898 scan[i].u.bmu_avail = 0;
899 }
900 }
901 /* fall through */
902 }
903 } else {
904 scan->u.bmu_avail += count;
905 /* scan->bm_bighint = radix; */
906 }
907
908 /*
909 * ALL-FREE special case.
910 */
911
912 if (scan->u.bmu_avail == radix)
913 return;
914 if (scan->u.bmu_avail > radix)
915 panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
916 (long long)count, (long long)scan->u.bmu_avail,
917 (long long)radix);
918
919 /*
920 * Break the free down into its components
921 */
922
923 blk = freeBlk & -radix;
924 radix /= BLIST_META_RADIX;
925
926 child = (freeBlk - blk) / radix;
927 blk += child * radix;
928 i = 1 + child * next_skip;
929 while (i < skip && blk < freeBlk + count) {
930 v = blk + radix - freeBlk;
931 if (v > count)
932 v = count;
933 blst_meta_free(&scan[i], freeBlk, v, radix);
934 if (scan->bm_bighint < scan[i].bm_bighint)
935 scan->bm_bighint = scan[i].bm_bighint;
936 count -= v;
937 freeBlk += v;
938 blk += radix;
939 i += next_skip;
940 }
941 }
942
943 /*
944 * BLIST_RADIX_COPY() - copy one radix tree to another
945 *
946 * Locates free space in the source tree and frees it in the destination
947 * tree. The space may not already be free in the destination.
948 */
949 static void
950 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
951 daddr_t count)
952 {
953 daddr_t i, next_skip, skip;
954
955 /*
956 * Leaf node
957 */
958
959 if (radix == BLIST_BMAP_RADIX) {
960 u_daddr_t v = scan->u.bmu_bitmap;
961
962 if (v == (u_daddr_t)-1) {
963 blist_free(dest, blk, count);
964 } else if (v != 0) {
965 int i;
966
967 for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
968 if (v & ((u_daddr_t)1 << i))
969 blist_free(dest, blk + i, 1);
970 }
971 }
972 return;
973 }
974
975 /*
976 * Meta node
977 */
978
979 if (scan->u.bmu_avail == 0) {
980 /*
981 * Source all allocated, leave dest allocated
982 */
983 return;
984 }
985 if (scan->u.bmu_avail == radix) {
986 /*
987 * Source all free, free entire dest
988 */
989 if (count < radix)
990 blist_free(dest, blk, count);
991 else
992 blist_free(dest, blk, radix);
993 return;
994 }
995
996
997 skip = radix_to_skip(radix);
998 next_skip = skip / BLIST_META_RADIX;
999 radix /= BLIST_META_RADIX;
1000
1001 for (i = 1; count && i < skip; i += next_skip) {
1002 if (scan[i].bm_bighint == (daddr_t)-1)
1003 break;
1004
1005 if (count >= radix) {
1006 blst_copy(&scan[i], blk, radix, dest, radix);
1007 count -= radix;
1008 } else {
1009 if (count) {
1010 blst_copy(&scan[i], blk, radix, dest, count);
1011 }
1012 count = 0;
1013 }
1014 blk += radix;
1015 }
1016 }
1017
1018 /*
1019 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
1020 *
1021 * This routine allocates all blocks in the specified range
1022 * regardless of any existing allocations in that range. Returns
1023 * the number of blocks allocated by the call.
1024 */
1025 static daddr_t
1026 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
1027 {
1028 daddr_t nblks;
1029 u_daddr_t mask;
1030 int n;
1031
1032 n = blk & BLIST_BMAP_MASK;
1033 mask = ((u_daddr_t)-1 << n) &
1034 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
1035
1036 /* Count the number of blocks that we are allocating. */
1037 nblks = bitcount64(scan->u.bmu_bitmap & mask);
1038
1039 scan->u.bmu_bitmap &= ~mask;
1040 return (nblks);
1041 }
1042
1043 /*
1044 * BLIST_META_FILL() - allocate specific blocks at a meta node
1045 *
1046 * This routine allocates the specified range of blocks,
1047 * regardless of any existing allocations in the range. The
1048 * range must be within the extent of this node. Returns the
1049 * number of blocks allocated by the call.
1050 */
1051 static daddr_t
1052 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
1053 {
1054 daddr_t blk, i, nblks, next_skip, skip, v;
1055 int child;
1056
1057 if (scan->bm_bighint == (daddr_t)-1)
1058 panic("filling invalid range");
1059 if (count > radix) {
1060 /*
1061 * The allocation exceeds the number of blocks that are
1062 * managed by this node.
1063 */
1064 panic("fill too large");
1065 }
1066 if (radix == BLIST_BMAP_RADIX)
1067 return (blst_leaf_fill(scan, allocBlk, count));
1068 if (count == radix || scan->u.bmu_avail == 0) {
1069 /*
1070 * ALL-ALLOCATED special case
1071 */
1072 nblks = scan->u.bmu_avail;
1073 scan->u.bmu_avail = 0;
1074 scan->bm_bighint = 0;
1075 return (nblks);
1076 }
1077 skip = radix_to_skip(radix);
1078 next_skip = skip / BLIST_META_RADIX;
1079 blk = allocBlk & -radix;
1080
1081 /*
1082 * An ALL-FREE meta node requires special handling before allocating
1083 * any of its blocks.
1084 */
1085 if (scan->u.bmu_avail == radix) {
1086 radix /= BLIST_META_RADIX;
1087
1088 /*
1089 * Reinitialize each of the meta node's children. An ALL-FREE
1090 * meta node cannot have a terminator in any subtree.
1091 */
1092 for (i = 1; i < skip; i += next_skip) {
1093 if (next_skip == 1)
1094 scan[i].u.bmu_bitmap = (u_daddr_t)-1;
1095 else
1096 scan[i].u.bmu_avail = radix;
1097 scan[i].bm_bighint = radix;
1098 }
1099 } else {
1100 radix /= BLIST_META_RADIX;
1101 }
1102
1103 nblks = 0;
1104 child = (allocBlk - blk) / radix;
1105 blk += child * radix;
1106 i = 1 + child * next_skip;
1107 while (i < skip && blk < allocBlk + count) {
1108 v = blk + radix - allocBlk;
1109 if (v > count)
1110 v = count;
1111 nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
1112 count -= v;
1113 allocBlk += v;
1114 blk += radix;
1115 i += next_skip;
1116 }
1117 scan->u.bmu_avail -= nblks;
1118 return (nblks);
1119 }
1120
1121 #ifdef BLIST_DEBUG
1122
1123 static void
1124 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1125 {
1126 daddr_t i, next_skip, skip;
1127
1128 if (radix == BLIST_BMAP_RADIX) {
1129 printf(
1130 "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
1131 tab, tab, "",
1132 (long long)blk, (long long)radix,
1133 (long long)scan->u.bmu_bitmap,
1134 (long long)scan->bm_bighint
1135 );
1136 return;
1137 }
1138
1139 if (scan->u.bmu_avail == 0) {
1140 printf(
1141 "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
1142 tab, tab, "",
1143 (long long)blk,
1144 (long long)radix
1145 );
1146 return;
1147 }
1148 if (scan->u.bmu_avail == radix) {
1149 printf(
1150 "%*.*s(%08llx,%lld) ALL FREE\n",
1151 tab, tab, "",
1152 (long long)blk,
1153 (long long)radix
1154 );
1155 return;
1156 }
1157
1158 printf(
1159 "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
1160 tab, tab, "",
1161 (long long)blk, (long long)radix,
1162 (long long)scan->u.bmu_avail,
1163 (long long)radix,
1164 (long long)scan->bm_bighint
1165 );
1166
1167 skip = radix_to_skip(radix);
1168 next_skip = skip / BLIST_META_RADIX;
1169 radix /= BLIST_META_RADIX;
1170 tab += 4;
1171
1172 for (i = 1; i < skip; i += next_skip) {
1173 if (scan[i].bm_bighint == (daddr_t)-1) {
1174 printf(
1175 "%*.*s(%08llx,%lld): Terminator\n",
1176 tab, tab, "",
1177 (long long)blk, (long long)radix
1178 );
1179 break;
1180 }
1181 blst_radix_print(&scan[i], blk, radix, tab);
1182 blk += radix;
1183 }
1184 tab -= 4;
1185
1186 printf(
1187 "%*.*s}\n",
1188 tab, tab, ""
1189 );
1190 }
1191
1192 #endif
1193
1194 #ifdef BLIST_DEBUG
1195
1196 int
1197 main(int ac, char **av)
1198 {
1199 int size = 1024;
1200 int i;
1201 blist_t bl;
1202 struct sbuf *s;
1203
1204 for (i = 1; i < ac; ++i) {
1205 const char *ptr = av[i];
1206 if (*ptr != '-') {
1207 size = strtol(ptr, NULL, 0);
1208 continue;
1209 }
1210 ptr += 2;
1211 fprintf(stderr, "Bad option: %s\n", ptr - 2);
1212 exit(1);
1213 }
1214 bl = blist_create(size, M_WAITOK);
1215 blist_free(bl, 0, size);
1216
1217 for (;;) {
1218 char buf[1024];
1219 long long da = 0;
1220 long long count = 0;
1221
1222 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1223 (long long)size, (long long)bl->bl_radix);
1224 fflush(stdout);
1225 if (fgets(buf, sizeof(buf), stdin) == NULL)
1226 break;
1227 switch(buf[0]) {
1228 case 'r':
1229 if (sscanf(buf + 1, "%lld", &count) == 1) {
1230 blist_resize(&bl, count, 1, M_WAITOK);
1231 } else {
1232 printf("?\n");
1233 }
1234 case 'p':
1235 blist_print(bl);
1236 break;
1237 case 's':
1238 s = sbuf_new_auto();
1239 blist_stats(bl, s);
1240 sbuf_finish(s);
1241 printf("%s", sbuf_data(s));
1242 sbuf_delete(s);
1243 break;
1244 case 'a':
1245 if (sscanf(buf + 1, "%lld", &count) == 1) {
1246 daddr_t blk = blist_alloc(bl, count);
1247 printf(" R=%08llx\n", (long long)blk);
1248 } else {
1249 printf("?\n");
1250 }
1251 break;
1252 case 'f':
1253 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1254 blist_free(bl, da, count);
1255 } else {
1256 printf("?\n");
1257 }
1258 break;
1259 case 'l':
1260 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1261 printf(" n=%jd\n",
1262 (intmax_t)blist_fill(bl, da, count));
1263 } else {
1264 printf("?\n");
1265 }
1266 break;
1267 case '?':
1268 case 'h':
1269 puts(
1270 "p -print\n"
1271 "s -stats\n"
1272 "a %d -allocate\n"
1273 "f %x %d -free\n"
1274 "l %x %d -fill\n"
1275 "r %d -resize\n"
1276 "h/? -help"
1277 );
1278 break;
1279 default:
1280 printf("?\n");
1281 break;
1282 }
1283 }
1284 return(0);
1285 }
1286
1287 void
1288 panic(const char *ctl, ...)
1289 {
1290 va_list va;
1291
1292 va_start(va, ctl);
1293 vfprintf(stderr, ctl, va);
1294 fprintf(stderr, "\n");
1295 va_end(va);
1296 exit(1);
1297 }
1298
1299 #endif
Cache object: 3d5997f59ae36002182c8f0377f0522b
|