FreeBSD/Linux Kernel Cross Reference
sys/kern/subr_blist.c
1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
18 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
23 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
25 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 /*
30 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
31 *
32 * This module implements a general bitmap allocator/deallocator. The
33 * allocator eats around 2 bits per 'block'. The module does not
34 * try to interpret the meaning of a 'block' other than to return
35 * SWAPBLK_NONE on an allocation failure.
36 *
37 * A radix tree controls access to pieces of the bitmap, and includes
38 * auxiliary information at each interior node about the availabilty of
39 * contiguous free blocks in the subtree rooted at that node. Two radix
40 * constants are involved: one for the size of the bitmaps contained in the
41 * leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendents of
42 * each of the meta (interior) nodes (BLIST_META_RADIX). Each subtree is
43 * associated with a range of blocks. The root of any subtree stores a
44 * hint field that defines an upper bound on the size of the largest
45 * allocation that can begin in the associated block range. A hint is an
46 * upper bound on a potential allocation, but not necessarily a tight upper
47 * bound.
48 *
49 * The bitmap field in each node directs the search for available blocks.
50 * For a leaf node, a bit is set if the corresponding block is free. For a
51 * meta node, a bit is set if the corresponding subtree contains a free
52 * block somewhere within it. The search at a meta node considers only
53 * children of that node that represent a range that includes a free block.
54 *
55 * The hinting greatly increases code efficiency for allocations while
56 * the general radix structure optimizes both allocations and frees. The
57 * radix tree should be able to operate well no matter how much
58 * fragmentation there is and no matter how large a bitmap is used.
59 *
60 * The blist code wires all necessary memory at creation time. Neither
61 * allocations nor frees require interaction with the memory subsystem.
62 * The non-blocking nature of allocations and frees is required by swap
63 * code (vm/swap_pager.c).
64 *
65 * LAYOUT: The radix tree is laid out recursively using a linear array.
66 * Each meta node is immediately followed (laid out sequentially in
67 * memory) by BLIST_META_RADIX lower level nodes. This is a recursive
68 * structure but one that can be easily scanned through a very simple
69 * 'skip' calculation. The memory allocation is only large enough to
70 * cover the number of blocks requested at creation time. Nodes that
71 * represent blocks beyond that limit, nodes that would never be read
72 * or written, are not allocated, so that the last of the
73 * BLIST_META_RADIX lower level nodes of a some nodes may not be
74 * allocated.
75 *
76 * NOTE: the allocator cannot currently allocate more than
77 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
78 * large' if you try. This is an area that could use improvement. The
79 * radix is large enough that this restriction does not effect the swap
80 * system, though. Currently only the allocation code is affected by
81 * this algorithmic unfeature. The freeing code can handle arbitrary
82 * ranges.
83 *
84 * This code can be compiled stand-alone for debugging.
85 */
86
87 #include <sys/cdefs.h>
88 __FBSDID("$FreeBSD$");
89
90 #ifdef _KERNEL
91
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/lock.h>
95 #include <sys/kernel.h>
96 #include <sys/blist.h>
97 #include <sys/malloc.h>
98 #include <sys/sbuf.h>
99 #include <sys/proc.h>
100 #include <sys/mutex.h>
101
102 #else
103
104 #ifndef BLIST_NO_DEBUG
105 #define BLIST_DEBUG
106 #endif
107
108 #include <sys/errno.h>
109 #include <sys/types.h>
110 #include <sys/malloc.h>
111 #include <sys/sbuf.h>
112 #include <stdio.h>
113 #include <string.h>
114 #include <stddef.h>
115 #include <stdlib.h>
116 #include <stdarg.h>
117 #include <stdbool.h>
118
119 #define bitcount64(x) __bitcount64((uint64_t)(x))
120 #define malloc(a,b,c) calloc(a, 1)
121 #define free(a,b) free(a)
122 #define ummin(a,b) ((a) < (b) ? (a) : (b))
123
124 #include <sys/blist.h>
125
126 void panic(const char *ctl, ...);
127
128 #endif
129
130 /*
131 * static support functions
132 */
133 static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
134 static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
135 u_daddr_t radix);
136 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
137 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
138 u_daddr_t radix);
139 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
140 blist_t dest, daddr_t count);
141 static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
142 static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
143 u_daddr_t radix);
144 #ifndef _KERNEL
145 static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
146 int tab);
147 #endif
148
149 #ifdef _KERNEL
150 static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
151 #endif
152
153 _Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
154 "radix divisibility error");
155 #define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1)
156 #define BLIST_META_MASK (BLIST_META_RADIX - 1)
157
158 /*
159 * For a subtree that can represent the state of up to 'radix' blocks, the
160 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm'
161 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
162 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
163 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip'
164 * in the 'meta' functions that process subtrees. Since integer division
165 * discards remainders, we can express this computation as
166 * skip = (m * m**h) / (m - 1)
167 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
168 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
169 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
170 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
171 * so that simple integer division by a constant can safely be used for the
172 * calculation.
173 */
174 static inline daddr_t
175 radix_to_skip(daddr_t radix)
176 {
177
178 return (radix /
179 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
180 }
181
182 /*
183 * Provide a mask with count bits set, starting as position n.
184 */
185 static inline u_daddr_t
186 bitrange(int n, int count)
187 {
188
189 return (((u_daddr_t)-1 << n) &
190 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - (n + count))));
191 }
192
193
194 /*
195 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
196 * Assumes that the argument has only one bit set.
197 */
198 static inline int
199 bitpos(u_daddr_t mask)
200 {
201 int hi, lo, mid;
202
203 switch (sizeof(mask)) {
204 #ifdef HAVE_INLINE_FFSLL
205 case sizeof(long long):
206 return (ffsll(mask) - 1);
207 #endif
208 default:
209 lo = 0;
210 hi = BLIST_BMAP_RADIX;
211 while (lo + 1 < hi) {
212 mid = (lo + hi) >> 1;
213 if ((mask >> mid) != 0)
214 lo = mid;
215 else
216 hi = mid;
217 }
218 return (lo);
219 }
220 }
221
222 /*
223 * blist_create() - create a blist capable of handling up to the specified
224 * number of blocks
225 *
226 * blocks - must be greater than 0
227 * flags - malloc flags
228 *
229 * The smallest blist consists of a single leaf node capable of
230 * managing BLIST_BMAP_RADIX blocks.
231 */
232 blist_t
233 blist_create(daddr_t blocks, int flags)
234 {
235 blist_t bl;
236 u_daddr_t nodes, radix;
237
238 if (blocks == 0)
239 panic("invalid block count");
240
241 /*
242 * Calculate the radix and node count used for scanning.
243 */
244 nodes = 1;
245 radix = BLIST_BMAP_RADIX;
246 while (radix <= blocks) {
247 nodes += 1 + (blocks - 1) / radix;
248 radix *= BLIST_META_RADIX;
249 }
250
251 bl = malloc(offsetof(struct blist, bl_root[nodes]), M_SWAP, flags |
252 M_ZERO);
253 if (bl == NULL)
254 return (NULL);
255
256 bl->bl_blocks = blocks;
257 bl->bl_radix = radix;
258
259 #if defined(BLIST_DEBUG)
260 printf(
261 "BLIST representing %lld blocks (%lld MB of swap)"
262 ", requiring %lldK of ram\n",
263 (long long)bl->bl_blocks,
264 (long long)bl->bl_blocks * 4 / 1024,
265 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
266 );
267 printf("BLIST raw radix tree contains %lld records\n",
268 (long long)nodes);
269 #endif
270
271 return (bl);
272 }
273
274 void
275 blist_destroy(blist_t bl)
276 {
277
278 free(bl, M_SWAP);
279 }
280
281 /*
282 * blist_alloc() - reserve space in the block bitmap. Return the base
283 * of a contiguous region or SWAPBLK_NONE if space could
284 * not be allocated.
285 */
286 daddr_t
287 blist_alloc(blist_t bl, daddr_t count)
288 {
289 daddr_t blk;
290
291 if (count > BLIST_MAX_ALLOC)
292 panic("allocation too large");
293
294 /*
295 * This loop iterates at most twice. An allocation failure in the
296 * first iteration leads to a second iteration only if the cursor was
297 * non-zero. When the cursor is zero, an allocation failure will
298 * stop further iterations.
299 */
300 for (;;) {
301 blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
302 bl->bl_radix);
303 if (blk != SWAPBLK_NONE) {
304 bl->bl_avail -= count;
305 bl->bl_cursor = blk + count;
306 if (bl->bl_cursor == bl->bl_blocks)
307 bl->bl_cursor = 0;
308 return (blk);
309 } else if (bl->bl_cursor == 0)
310 return (SWAPBLK_NONE);
311 bl->bl_cursor = 0;
312 }
313 }
314
315 /*
316 * blist_avail() - return the number of free blocks.
317 */
318 daddr_t
319 blist_avail(blist_t bl)
320 {
321
322 return (bl->bl_avail);
323 }
324
325 /*
326 * blist_free() - free up space in the block bitmap. Return the base
327 * of a contiguous region. Panic if an inconsistancy is
328 * found.
329 */
330 void
331 blist_free(blist_t bl, daddr_t blkno, daddr_t count)
332 {
333
334 if (blkno < 0 || blkno + count > bl->bl_blocks)
335 panic("freeing invalid range");
336 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
337 bl->bl_avail += count;
338 }
339
340 /*
341 * blist_fill() - mark a region in the block bitmap as off-limits
342 * to the allocator (i.e. allocate it), ignoring any
343 * existing allocations. Return the number of blocks
344 * actually filled that were free before the call.
345 */
346 daddr_t
347 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
348 {
349 daddr_t filled;
350
351 if (blkno < 0 || blkno + count > bl->bl_blocks)
352 panic("filling invalid range");
353 filled = blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix);
354 bl->bl_avail -= filled;
355 return (filled);
356 }
357
358 /*
359 * blist_resize() - resize an existing radix tree to handle the
360 * specified number of blocks. This will reallocate
361 * the tree and transfer the previous bitmap to the new
362 * one. When extending the tree you can specify whether
363 * the new blocks are to left allocated or freed.
364 */
365 void
366 blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
367 {
368 blist_t newbl = blist_create(count, flags);
369 blist_t save = *pbl;
370
371 *pbl = newbl;
372 if (count > save->bl_blocks)
373 count = save->bl_blocks;
374 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);
375
376 /*
377 * If resizing upwards, should we free the new space or not?
378 */
379 if (freenew && count < newbl->bl_blocks) {
380 blist_free(newbl, count, newbl->bl_blocks - count);
381 }
382 blist_destroy(save);
383 }
384
385 #ifdef BLIST_DEBUG
386
387 /*
388 * blist_print() - dump radix tree
389 */
390 void
391 blist_print(blist_t bl)
392 {
393 printf("BLIST avail = %jd, cursor = %08jx {\n",
394 (uintmax_t)bl->bl_avail, (uintmax_t)bl->bl_cursor);
395
396 if (bl->bl_root->bm_bitmap != 0)
397 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
398 printf("}\n");
399 }
400
401 #endif
402
403 static const u_daddr_t fib[] = {
404 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
405 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
406 514229, 832040, 1346269, 2178309, 3524578,
407 };
408
409 /*
410 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
411 */
412 struct gap_stats {
413 daddr_t start; /* current gap start, or SWAPBLK_NONE */
414 daddr_t num; /* number of gaps observed */
415 daddr_t max; /* largest gap size */
416 daddr_t avg; /* average gap size */
417 daddr_t err; /* sum - num * avg */
418 daddr_t histo[nitems(fib)]; /* # gaps in each size range */
419 int max_bucket; /* last histo elt with nonzero val */
420 };
421
422 /*
423 * gap_stats_counting() - is the state 'counting 1 bits'?
424 * or 'skipping 0 bits'?
425 */
426 static inline bool
427 gap_stats_counting(const struct gap_stats *stats)
428 {
429
430 return (stats->start != SWAPBLK_NONE);
431 }
432
433 /*
434 * init_gap_stats() - initialize stats on gap sizes
435 */
436 static inline void
437 init_gap_stats(struct gap_stats *stats)
438 {
439
440 bzero(stats, sizeof(*stats));
441 stats->start = SWAPBLK_NONE;
442 }
443
444 /*
445 * update_gap_stats() - update stats on gap sizes
446 */
447 static void
448 update_gap_stats(struct gap_stats *stats, daddr_t posn)
449 {
450 daddr_t size;
451 int hi, lo, mid;
452
453 if (!gap_stats_counting(stats)) {
454 stats->start = posn;
455 return;
456 }
457 size = posn - stats->start;
458 stats->start = SWAPBLK_NONE;
459 if (size > stats->max)
460 stats->max = size;
461
462 /*
463 * Find the fibonacci range that contains size,
464 * expecting to find it in an early range.
465 */
466 lo = 0;
467 hi = 1;
468 while (hi < nitems(fib) && fib[hi] <= size) {
469 lo = hi;
470 hi *= 2;
471 }
472 if (hi >= nitems(fib))
473 hi = nitems(fib);
474 while (lo + 1 != hi) {
475 mid = (lo + hi) >> 1;
476 if (fib[mid] <= size)
477 lo = mid;
478 else
479 hi = mid;
480 }
481 stats->histo[lo]++;
482 if (lo > stats->max_bucket)
483 stats->max_bucket = lo;
484 stats->err += size - stats->avg;
485 stats->num++;
486 stats->avg += stats->err / stats->num;
487 stats->err %= stats->num;
488 }
489
490 /*
491 * dump_gap_stats() - print stats on gap sizes
492 */
493 static inline void
494 dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
495 {
496 int i;
497
498 sbuf_printf(s, "number of maximal free ranges: %jd\n",
499 (intmax_t)stats->num);
500 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
501 sbuf_printf(s, "average maximal free range size: %jd\n",
502 (intmax_t)stats->avg);
503 sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
504 sbuf_printf(s, " count | size range\n");
505 sbuf_printf(s, " ----- | ----------\n");
506 for (i = 0; i < stats->max_bucket; i++) {
507 if (stats->histo[i] != 0) {
508 sbuf_printf(s, "%20jd | ",
509 (intmax_t)stats->histo[i]);
510 if (fib[i] != fib[i + 1] - 1)
511 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
512 (intmax_t)fib[i + 1] - 1);
513 else
514 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
515 }
516 }
517 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]);
518 if (stats->histo[i] > 1)
519 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
520 (intmax_t)stats->max);
521 else
522 sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
523 }
524
525 /*
526 * blist_stats() - dump radix tree stats
527 */
528 void
529 blist_stats(blist_t bl, struct sbuf *s)
530 {
531 struct gap_stats gstats;
532 struct gap_stats *stats = &gstats;
533 daddr_t i, nodes, radix;
534 u_daddr_t bit, diff, mask;
535
536 init_gap_stats(stats);
537 nodes = 0;
538 i = bl->bl_radix;
539 while (i < bl->bl_radix + bl->bl_blocks) {
540 /*
541 * Find max size subtree starting at i.
542 */
543 radix = BLIST_BMAP_RADIX;
544 while (((i / radix) & BLIST_META_MASK) == 0)
545 radix *= BLIST_META_RADIX;
546
547 /*
548 * Check for skippable subtrees starting at i.
549 */
550 while (radix > BLIST_BMAP_RADIX) {
551 if (bl->bl_root[nodes].bm_bitmap == 0) {
552 if (gap_stats_counting(stats))
553 update_gap_stats(stats, i);
554 break;
555 }
556
557 /*
558 * Skip subtree root.
559 */
560 nodes++;
561 radix /= BLIST_META_RADIX;
562 }
563 if (radix == BLIST_BMAP_RADIX) {
564 /*
565 * Scan leaf.
566 */
567 mask = bl->bl_root[nodes].bm_bitmap;
568 diff = mask ^ (mask << 1);
569 if (gap_stats_counting(stats))
570 diff ^= 1;
571 while (diff != 0) {
572 bit = diff & -diff;
573 update_gap_stats(stats, i + bitpos(bit));
574 diff ^= bit;
575 }
576 }
577 nodes += radix_to_skip(radix);
578 i += radix;
579 }
580 update_gap_stats(stats, i);
581 dump_gap_stats(stats, s);
582 }
583
584 /************************************************************************
585 * ALLOCATION SUPPORT FUNCTIONS *
586 ************************************************************************
587 *
588 * These support functions do all the actual work. They may seem
589 * rather longish, but that's because I've commented them up. The
590 * actual code is straight forward.
591 *
592 */
593
594 /*
595 * BLST_NEXT_LEAF_ALLOC() - allocate the first few blocks in the next leaf.
596 *
597 * 'scan' is a leaf node, associated with a block containing 'blk'.
598 * The next leaf node could be adjacent, or several nodes away if the
599 * least common ancestor of 'scan' and its neighbor is several levels
600 * up. Use 'blk' to determine how many meta-nodes lie between the
601 * leaves. If the next leaf has enough initial bits set, clear them
602 * and clear the bits in the meta nodes on the path up to the least
603 * common ancestor to mark any subtrees made completely empty.
604 */
605 static int
606 blst_next_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
607 {
608 blmeta_t *next;
609 daddr_t skip;
610 u_daddr_t radix;
611 int digit;
612
613 next = scan + 1;
614 blk += BLIST_BMAP_RADIX;
615 radix = BLIST_BMAP_RADIX;
616 while ((digit = ((blk / radix) & BLIST_META_MASK)) == 0 &&
617 (next->bm_bitmap & 1) == 1) {
618 next++;
619 radix *= BLIST_META_RADIX;
620 }
621 if (((next->bm_bitmap + 1) & ~((u_daddr_t)-1 << count)) != 0) {
622 /*
623 * The next leaf doesn't have enough free blocks at the
624 * beginning to complete the spanning allocation.
625 */
626 return (ENOMEM);
627 }
628 /* Clear the first 'count' bits in the next leaf to allocate. */
629 next->bm_bitmap &= (u_daddr_t)-1 << count;
630
631 /*
632 * Update bitmaps of next-ancestors, up to least common ancestor.
633 */
634 skip = radix_to_skip(radix);
635 while (radix != BLIST_BMAP_RADIX && next->bm_bitmap == 0) {
636 (--next)->bm_bitmap ^= 1;
637 radix /= BLIST_META_RADIX;
638 }
639 if (next->bm_bitmap == 0)
640 scan[-digit * skip].bm_bitmap ^= (u_daddr_t)1 << digit;
641 return (0);
642 }
643
644 /*
645 * BLST_LEAF_ALLOC() - allocate at a leaf in the radix tree (a bitmap).
646 *
647 * This function is the core of the allocator. Its execution time is
648 * proportional to log(count), plus height of the tree if the allocation
649 * crosses a leaf boundary.
650 */
651 static daddr_t
652 blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
653 {
654 u_daddr_t cursor_mask, mask;
655 int count1, hi, lo, num_shifts, range1, range_ext;
656
657 range1 = 0;
658 count1 = count - 1;
659 num_shifts = fls(count1);
660 mask = scan->bm_bitmap;
661 while ((-mask & ~mask) != 0 && num_shifts > 0) {
662 /*
663 * If bit i is set in mask, then bits in [i, i+range1] are set
664 * in scan->bm_bitmap. The value of range1 is equal to count1
665 * >> num_shifts. Grow range1 and reduce num_shifts to 0,
666 * while preserving these invariants. The updates to mask
667 * leave fewer bits set, but each bit that remains set
668 * represents a longer string of consecutive bits set in
669 * scan->bm_bitmap. If more updates to mask cannot clear more
670 * bits, because mask is partitioned with all 0 bits preceding
671 * all 1 bits, the loop terminates immediately.
672 */
673 num_shifts--;
674 range_ext = range1 + ((count1 >> num_shifts) & 1);
675 /*
676 * mask is a signed quantity for the shift because when it is
677 * shifted right, the sign bit should copied; when the last
678 * block of the leaf is free, pretend, for a while, that all the
679 * blocks that follow it are also free.
680 */
681 mask &= (daddr_t)mask >> range_ext;
682 range1 += range_ext;
683 }
684 if (mask == 0) {
685 /*
686 * Update bighint. There is no allocation bigger than range1
687 * starting in this leaf.
688 */
689 scan->bm_bighint = range1;
690 return (SWAPBLK_NONE);
691 }
692
693 /* Discard any candidates that appear before blk. */
694 if ((blk & BLIST_BMAP_MASK) != 0) {
695 cursor_mask = mask & bitrange(0, blk & BLIST_BMAP_MASK);
696 if (cursor_mask != 0) {
697 mask ^= cursor_mask;
698 if (mask == 0)
699 return (SWAPBLK_NONE);
700
701 /*
702 * Bighint change for last block allocation cannot
703 * assume that any other blocks are allocated, so the
704 * bighint cannot be reduced much.
705 */
706 range1 = BLIST_MAX_ALLOC - 1;
707 }
708 blk &= ~BLIST_BMAP_MASK;
709 }
710
711 /*
712 * The least significant set bit in mask marks the start of the first
713 * available range of sufficient size. Clear all the bits but that one,
714 * and then find its position.
715 */
716 mask &= -mask;
717 lo = bitpos(mask);
718
719 hi = lo + count;
720 if (hi > BLIST_BMAP_RADIX) {
721 /*
722 * An allocation within this leaf is impossible, so a successful
723 * allocation depends on the next leaf providing some of the blocks.
724 */
725 if (blst_next_leaf_alloc(scan, blk, hi - BLIST_BMAP_RADIX) != 0)
726 /*
727 * The hint cannot be updated, because the same
728 * allocation request could be satisfied later, by this
729 * leaf, if the state of the next leaf changes, and
730 * without any changes to this leaf.
731 */
732 return (SWAPBLK_NONE);
733 hi = BLIST_BMAP_RADIX;
734 }
735
736 /* Set the bits of mask at position 'lo' and higher. */
737 mask = -mask;
738 if (hi == BLIST_BMAP_RADIX) {
739 /*
740 * Update bighint. There is no allocation bigger than range1
741 * available in this leaf after this allocation completes.
742 */
743 scan->bm_bighint = range1;
744 } else {
745 /* Clear the bits of mask at position 'hi' and higher. */
746 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
747 }
748 /* Clear the allocated bits from this leaf. */
749 scan->bm_bitmap &= ~mask;
750 return (blk + lo);
751 }
752
753 /*
754 * blist_meta_alloc() - allocate at a meta in the radix tree.
755 *
756 * Attempt to allocate at a meta node. If we can't, we update
757 * bighint and return a failure. Updating bighint optimize future
758 * calls that hit this node. We have to check for our collapse cases
759 * and we have a few optimizations strewn in as well.
760 */
761 static daddr_t
762 blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
763 {
764 daddr_t blk, i, r, skip;
765 u_daddr_t bit, mask;
766 bool scan_from_start;
767 int digit;
768
769 if (radix == BLIST_BMAP_RADIX)
770 return (blst_leaf_alloc(scan, cursor, count));
771 blk = cursor & -radix;
772 scan_from_start = (cursor == blk);
773 radix /= BLIST_META_RADIX;
774 skip = radix_to_skip(radix);
775 mask = scan->bm_bitmap;
776
777 /* Discard any candidates that appear before cursor. */
778 digit = (cursor / radix) & BLIST_META_MASK;
779 mask &= (u_daddr_t)-1 << digit;
780 if (mask == 0)
781 return (SWAPBLK_NONE);
782
783 /*
784 * If the first try is for a block that includes the cursor, pre-undo
785 * the digit * radix offset in the first call; otherwise, ignore the
786 * cursor entirely.
787 */
788 if (((mask >> digit) & 1) == 1)
789 cursor -= digit * radix;
790 else
791 cursor = blk;
792
793 /*
794 * Examine the nonempty subtree associated with each bit set in mask.
795 */
796 do {
797 bit = mask & -mask;
798 digit = bitpos(bit);
799 i = 1 + digit * skip;
800 if (count <= scan[i].bm_bighint) {
801 /*
802 * The allocation might fit beginning in the i'th subtree.
803 */
804 r = blst_meta_alloc(&scan[i], cursor + digit * radix,
805 count, radix);
806 if (r != SWAPBLK_NONE) {
807 if (scan[i].bm_bitmap == 0)
808 scan->bm_bitmap ^= bit;
809 return (r);
810 }
811 }
812 cursor = blk;
813 } while ((mask ^= bit) != 0);
814
815 /*
816 * We couldn't allocate count in this subtree. If the whole tree was
817 * scanned, and the last tree node is allocated, update bighint.
818 */
819 if (scan_from_start && !(digit == BLIST_META_RADIX - 1 &&
820 scan[i].bm_bighint == BLIST_MAX_ALLOC))
821 scan->bm_bighint = count - 1;
822
823 return (SWAPBLK_NONE);
824 }
825
826 /*
827 * BLST_LEAF_FREE() - free allocated block from leaf bitmap
828 *
829 */
830 static void
831 blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
832 {
833 u_daddr_t mask;
834
835 /*
836 * free some data in this bitmap
837 * mask=0000111111111110000
838 * \_________/\__/
839 * count n
840 */
841 mask = bitrange(blk & BLIST_BMAP_MASK, count);
842 if (scan->bm_bitmap & mask)
843 panic("freeing free block");
844 scan->bm_bitmap |= mask;
845 }
846
847 /*
848 * BLST_META_FREE() - free allocated blocks from radix tree meta info
849 *
850 * This support routine frees a range of blocks from the bitmap.
851 * The range must be entirely enclosed by this radix node. If a
852 * meta node, we break the range down recursively to free blocks
853 * in subnodes (which means that this code can free an arbitrary
854 * range whereas the allocation code cannot allocate an arbitrary
855 * range).
856 */
857 static void
858 blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
859 {
860 daddr_t blk, endBlk, i, skip;
861 int digit, endDigit;
862
863 /*
864 * We could probably do a better job here. We are required to make
865 * bighint at least as large as the biggest allocable block of data.
866 * If we just shoehorn it, a little extra overhead will be incurred
867 * on the next allocation (but only that one typically).
868 */
869 scan->bm_bighint = BLIST_MAX_ALLOC;
870
871 if (radix == BLIST_BMAP_RADIX)
872 return (blst_leaf_free(scan, freeBlk, count));
873
874 endBlk = ummin(freeBlk + count, (freeBlk + radix) & -radix);
875 radix /= BLIST_META_RADIX;
876 skip = radix_to_skip(radix);
877 blk = freeBlk & -radix;
878 digit = (blk / radix) & BLIST_META_MASK;
879 endDigit = 1 + (((endBlk - 1) / radix) & BLIST_META_MASK);
880 scan->bm_bitmap |= bitrange(digit, endDigit - digit);
881 for (i = 1 + digit * skip; blk < endBlk; i += skip) {
882 blk += radix;
883 count = ummin(blk, endBlk) - freeBlk;
884 blst_meta_free(&scan[i], freeBlk, count, radix);
885 freeBlk = blk;
886 }
887 }
888
889 /*
890 * BLST_COPY() - copy one radix tree to another
891 *
892 * Locates free space in the source tree and frees it in the destination
893 * tree. The space may not already be free in the destination.
894 */
895 static void
896 blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
897 daddr_t count)
898 {
899 daddr_t endBlk, i, skip;
900
901 /*
902 * Leaf node
903 */
904
905 if (radix == BLIST_BMAP_RADIX) {
906 u_daddr_t v = scan->bm_bitmap;
907
908 if (v == (u_daddr_t)-1) {
909 blist_free(dest, blk, count);
910 } else if (v != 0) {
911 int i;
912
913 for (i = 0; i < count; ++i) {
914 if (v & ((u_daddr_t)1 << i))
915 blist_free(dest, blk + i, 1);
916 }
917 }
918 return;
919 }
920
921 /*
922 * Meta node
923 */
924
925 if (scan->bm_bitmap == 0) {
926 /*
927 * Source all allocated, leave dest allocated
928 */
929 return;
930 }
931
932 endBlk = blk + count;
933 radix /= BLIST_META_RADIX;
934 skip = radix_to_skip(radix);
935 for (i = 1; blk < endBlk; i += skip) {
936 blk += radix;
937 count = radix;
938 if (blk >= endBlk)
939 count -= blk - endBlk;
940 blst_copy(&scan[i], blk - radix, radix, dest, count);
941 }
942 }
943
944 /*
945 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
946 *
947 * This routine allocates all blocks in the specified range
948 * regardless of any existing allocations in that range. Returns
949 * the number of blocks allocated by the call.
950 */
951 static daddr_t
952 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
953 {
954 daddr_t nblks;
955 u_daddr_t mask;
956
957 mask = bitrange(blk & BLIST_BMAP_MASK, count);
958
959 /* Count the number of blocks that we are allocating. */
960 nblks = bitcount64(scan->bm_bitmap & mask);
961
962 scan->bm_bitmap &= ~mask;
963 return (nblks);
964 }
965
966 /*
967 * BLIST_META_FILL() - allocate specific blocks at a meta node
968 *
969 * This routine allocates the specified range of blocks,
970 * regardless of any existing allocations in the range. The
971 * range must be within the extent of this node. Returns the
972 * number of blocks allocated by the call.
973 */
974 static daddr_t
975 blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
976 {
977 daddr_t blk, endBlk, i, nblks, skip;
978 int digit;
979
980 if (radix == BLIST_BMAP_RADIX)
981 return (blst_leaf_fill(scan, allocBlk, count));
982
983 endBlk = ummin(allocBlk + count, (allocBlk + radix) & -radix);
984 radix /= BLIST_META_RADIX;
985 skip = radix_to_skip(radix);
986 blk = allocBlk & -radix;
987 nblks = 0;
988 while (blk < endBlk) {
989 digit = (blk / radix) & BLIST_META_MASK;
990 i = 1 + digit * skip;
991 blk += radix;
992 count = ummin(blk, endBlk) - allocBlk;
993 nblks += blst_meta_fill(&scan[i], allocBlk, count, radix);
994 if (scan[i].bm_bitmap == 0)
995 scan->bm_bitmap &= ~((u_daddr_t)1 << digit);
996 allocBlk = blk;
997 }
998 return (nblks);
999 }
1000
1001 #ifdef BLIST_DEBUG
1002
1003 static void
1004 blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
1005 {
1006 daddr_t skip;
1007 u_daddr_t bit, mask;
1008 int digit;
1009
1010 if (radix == BLIST_BMAP_RADIX) {
1011 printf(
1012 "%*.*s(%08llx,%lld): bitmap %0*llx big=%lld\n",
1013 tab, tab, "",
1014 (long long)blk, (long long)radix,
1015 1 + (BLIST_BMAP_RADIX - 1) / 4,
1016 (long long)scan->bm_bitmap,
1017 (long long)scan->bm_bighint
1018 );
1019 return;
1020 }
1021
1022 printf(
1023 "%*.*s(%08llx): subtree (%lld/%lld) bitmap %0*llx big=%lld {\n",
1024 tab, tab, "",
1025 (long long)blk, (long long)radix,
1026 (long long)radix,
1027 1 + (BLIST_META_RADIX - 1) / 4,
1028 (long long)scan->bm_bitmap,
1029 (long long)scan->bm_bighint
1030 );
1031
1032 radix /= BLIST_META_RADIX;
1033 skip = radix_to_skip(radix);
1034 tab += 4;
1035
1036 mask = scan->bm_bitmap;
1037 /* Examine the nonempty subtree associated with each bit set in mask */
1038 do {
1039 bit = mask & -mask;
1040 digit = bitpos(bit);
1041 blst_radix_print(&scan[1 + digit * skip], blk + digit * radix,
1042 radix, tab);
1043 } while ((mask ^= bit) != 0);
1044 tab -= 4;
1045
1046 printf(
1047 "%*.*s}\n",
1048 tab, tab, ""
1049 );
1050 }
1051
1052 #endif
1053
1054 #ifdef BLIST_DEBUG
1055
1056 int
1057 main(int ac, char **av)
1058 {
1059 int size = BLIST_META_RADIX * BLIST_BMAP_RADIX;
1060 int i;
1061 blist_t bl;
1062 struct sbuf *s;
1063
1064 for (i = 1; i < ac; ++i) {
1065 const char *ptr = av[i];
1066 if (*ptr != '-') {
1067 size = strtol(ptr, NULL, 0);
1068 continue;
1069 }
1070 ptr += 2;
1071 fprintf(stderr, "Bad option: %s\n", ptr - 2);
1072 exit(1);
1073 }
1074 bl = blist_create(size, M_WAITOK);
1075 blist_free(bl, 0, size);
1076
1077 for (;;) {
1078 char buf[1024];
1079 long long da = 0;
1080 long long count = 0;
1081
1082 printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
1083 (long long)size, (long long)bl->bl_radix);
1084 fflush(stdout);
1085 if (fgets(buf, sizeof(buf), stdin) == NULL)
1086 break;
1087 switch(buf[0]) {
1088 case 'r':
1089 if (sscanf(buf + 1, "%lld", &count) == 1) {
1090 blist_resize(&bl, count, 1, M_WAITOK);
1091 } else {
1092 printf("?\n");
1093 }
1094 case 'p':
1095 blist_print(bl);
1096 break;
1097 case 's':
1098 s = sbuf_new_auto();
1099 blist_stats(bl, s);
1100 sbuf_finish(s);
1101 printf("%s", sbuf_data(s));
1102 sbuf_delete(s);
1103 break;
1104 case 'a':
1105 if (sscanf(buf + 1, "%lld", &count) == 1) {
1106 daddr_t blk = blist_alloc(bl, count);
1107 printf(" R=%08llx\n", (long long)blk);
1108 } else {
1109 printf("?\n");
1110 }
1111 break;
1112 case 'f':
1113 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1114 blist_free(bl, da, count);
1115 } else {
1116 printf("?\n");
1117 }
1118 break;
1119 case 'l':
1120 if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1121 printf(" n=%jd\n",
1122 (intmax_t)blist_fill(bl, da, count));
1123 } else {
1124 printf("?\n");
1125 }
1126 break;
1127 case '?':
1128 case 'h':
1129 puts(
1130 "p -print\n"
1131 "s -stats\n"
1132 "a %d -allocate\n"
1133 "f %x %d -free\n"
1134 "l %x %d -fill\n"
1135 "r %d -resize\n"
1136 "h/? -help"
1137 );
1138 break;
1139 default:
1140 printf("?\n");
1141 break;
1142 }
1143 }
1144 return(0);
1145 }
1146
1147 void
1148 panic(const char *ctl, ...)
1149 {
1150 va_list va;
1151
1152 va_start(va, ctl);
1153 vfprintf(stderr, ctl, va);
1154 fprintf(stderr, "\n");
1155 va_end(va);
1156 exit(1);
1157 }
1158
1159 #endif
Cache object: a682d3592ef5a6c54db53abfb8939e44
|