1 /*
2 * Copyright (c) 2013 EMC Corp.
3 * Copyright (c) 2011 Jeffrey Roberson <jeff@freebsd.org>
4 * Copyright (c) 2008 Mayur Shardul <mayur.shardul@gmail.com>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 */
29
30 /*
31 * Path-compressed radix trie implementation.
32 *
33 * The implementation takes into account the following rationale:
34 * - Size of the nodes should be as small as possible but still big enough
35 * to avoid a large maximum depth for the trie. This is a balance
36 * between the necessity to not wire too much physical memory for the nodes
37 * and the necessity to avoid too much cache pollution during the trie
38 * operations.
39 * - There is not a huge bias toward the number of lookup operations over
40 * the number of insert and remove operations. This basically implies
41 * that optimizations supposedly helping one operation but hurting the
42 * other might be carefully evaluated.
43 * - On average not many nodes are expected to be fully populated, hence
44 * level compression may just complicate things.
45 */
46
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49
50 #include "opt_ddb.h"
51
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/kernel.h>
55 #include <sys/pctrie.h>
56
57 #ifdef DDB
58 #include <ddb/ddb.h>
59 #endif
60
61 #define PCTRIE_MASK (PCTRIE_COUNT - 1)
62 #define PCTRIE_LIMIT (howmany(sizeof(uint64_t) * NBBY, PCTRIE_WIDTH) - 1)
63
64 /* Flag bits stored in node pointers. */
65 #define PCTRIE_ISLEAF 0x1
66 #define PCTRIE_FLAGS 0x1
67 #define PCTRIE_PAD PCTRIE_FLAGS
68
69 /* Returns one unit associated with specified level. */
70 #define PCTRIE_UNITLEVEL(lev) \
71 ((uint64_t)1 << ((lev) * PCTRIE_WIDTH))
72
73 struct pctrie_node {
74 uint64_t pn_owner; /* Owner of record. */
75 uint16_t pn_count; /* Valid children. */
76 uint16_t pn_clev; /* Current level. */
77 void *pn_child[PCTRIE_COUNT]; /* Child nodes. */
78 };
79
80 /*
81 * Allocate a node. Pre-allocation should ensure that the request
82 * will always be satisfied.
83 */
84 static __inline struct pctrie_node *
85 pctrie_node_get(struct pctrie *ptree, pctrie_alloc_t allocfn, uint64_t owner,
86 uint16_t count, uint16_t clevel)
87 {
88 struct pctrie_node *node;
89
90 node = allocfn(ptree);
91 if (node == NULL)
92 return (NULL);
93 node->pn_owner = owner;
94 node->pn_count = count;
95 node->pn_clev = clevel;
96
97 return (node);
98 }
99
100 /*
101 * Free radix node.
102 */
103 static __inline void
104 pctrie_node_put(struct pctrie *ptree, struct pctrie_node *node,
105 pctrie_free_t freefn)
106 {
107 #ifdef INVARIANTS
108 int slot;
109
110 KASSERT(node->pn_count == 0,
111 ("pctrie_node_put: node %p has %d children", node,
112 node->pn_count));
113 for (slot = 0; slot < PCTRIE_COUNT; slot++)
114 KASSERT(node->pn_child[slot] == NULL,
115 ("pctrie_node_put: node %p has a child", node));
116 #endif
117 freefn(ptree, node);
118 }
119
120 /*
121 * Return the position in the array for a given level.
122 */
123 static __inline int
124 pctrie_slot(uint64_t index, uint16_t level)
125 {
126
127 return ((index >> (level * PCTRIE_WIDTH)) & PCTRIE_MASK);
128 }
129
130 /* Trims the key after the specified level. */
131 static __inline uint64_t
132 pctrie_trimkey(uint64_t index, uint16_t level)
133 {
134 uint64_t ret;
135
136 ret = index;
137 if (level > 0) {
138 ret >>= level * PCTRIE_WIDTH;
139 ret <<= level * PCTRIE_WIDTH;
140 }
141 return (ret);
142 }
143
144 /*
145 * Get the root node for a tree.
146 */
147 static __inline struct pctrie_node *
148 pctrie_getroot(struct pctrie *ptree)
149 {
150
151 return ((struct pctrie_node *)ptree->pt_root);
152 }
153
154 /*
155 * Set the root node for a tree.
156 */
157 static __inline void
158 pctrie_setroot(struct pctrie *ptree, struct pctrie_node *node)
159 {
160
161 ptree->pt_root = (uintptr_t)node;
162 }
163
164 /*
165 * Returns TRUE if the specified node is a leaf and FALSE otherwise.
166 */
167 static __inline boolean_t
168 pctrie_isleaf(struct pctrie_node *node)
169 {
170
171 return (((uintptr_t)node & PCTRIE_ISLEAF) != 0);
172 }
173
174 /*
175 * Returns the associated val extracted from node.
176 */
177 static __inline uint64_t *
178 pctrie_toval(struct pctrie_node *node)
179 {
180
181 return ((uint64_t *)((uintptr_t)node & ~PCTRIE_FLAGS));
182 }
183
184 /*
185 * Adds the val as a child of the provided node.
186 */
187 static __inline void
188 pctrie_addval(struct pctrie_node *node, uint64_t index, uint16_t clev,
189 uint64_t *val)
190 {
191 int slot;
192
193 slot = pctrie_slot(index, clev);
194 node->pn_child[slot] = (void *)((uintptr_t)val | PCTRIE_ISLEAF);
195 }
196
197 /*
198 * Returns the slot where two keys differ.
199 * It cannot accept 2 equal keys.
200 */
201 static __inline uint16_t
202 pctrie_keydiff(uint64_t index1, uint64_t index2)
203 {
204 uint16_t clev;
205
206 KASSERT(index1 != index2, ("%s: passing the same key value %jx",
207 __func__, (uintmax_t)index1));
208
209 index1 ^= index2;
210 for (clev = PCTRIE_LIMIT;; clev--)
211 if (pctrie_slot(index1, clev) != 0)
212 return (clev);
213 }
214
215 /*
216 * Returns TRUE if it can be determined that key does not belong to the
217 * specified node. Otherwise, returns FALSE.
218 */
219 static __inline boolean_t
220 pctrie_keybarr(struct pctrie_node *node, uint64_t idx)
221 {
222
223 if (node->pn_clev < PCTRIE_LIMIT) {
224 idx = pctrie_trimkey(idx, node->pn_clev + 1);
225 return (idx != node->pn_owner);
226 }
227 return (FALSE);
228 }
229
230 /*
231 * Internal helper for pctrie_reclaim_allnodes().
232 * This function is recursive.
233 */
234 static void
235 pctrie_reclaim_allnodes_int(struct pctrie *ptree, struct pctrie_node *node,
236 pctrie_free_t freefn)
237 {
238 int slot;
239
240 KASSERT(node->pn_count <= PCTRIE_COUNT,
241 ("pctrie_reclaim_allnodes_int: bad count in node %p", node));
242 for (slot = 0; node->pn_count != 0; slot++) {
243 if (node->pn_child[slot] == NULL)
244 continue;
245 if (!pctrie_isleaf(node->pn_child[slot]))
246 pctrie_reclaim_allnodes_int(ptree,
247 node->pn_child[slot], freefn);
248 node->pn_child[slot] = NULL;
249 node->pn_count--;
250 }
251 pctrie_node_put(ptree, node, freefn);
252 }
253
254 /*
255 * pctrie node zone initializer.
256 */
257 int
258 pctrie_zone_init(void *mem, int size __unused, int flags __unused)
259 {
260 struct pctrie_node *node;
261
262 node = mem;
263 memset(node->pn_child, 0, sizeof(node->pn_child));
264 return (0);
265 }
266
267 size_t
268 pctrie_node_size(void)
269 {
270
271 return (sizeof(struct pctrie_node));
272 }
273
274 /*
275 * Inserts the key-value pair into the trie.
276 * Panics if the key already exists.
277 */
278 int
279 pctrie_insert(struct pctrie *ptree, uint64_t *val, pctrie_alloc_t allocfn)
280 {
281 uint64_t index, newind;
282 void **parentp;
283 struct pctrie_node *node, *tmp;
284 uint64_t *m;
285 int slot;
286 uint16_t clev;
287
288 index = *val;
289
290 /*
291 * The owner of record for root is not really important because it
292 * will never be used.
293 */
294 node = pctrie_getroot(ptree);
295 if (node == NULL) {
296 ptree->pt_root = (uintptr_t)val | PCTRIE_ISLEAF;
297 return (0);
298 }
299 parentp = (void **)&ptree->pt_root;
300 for (;;) {
301 if (pctrie_isleaf(node)) {
302 m = pctrie_toval(node);
303 if (*m == index)
304 panic("%s: key %jx is already present",
305 __func__, (uintmax_t)index);
306 clev = pctrie_keydiff(*m, index);
307 tmp = pctrie_node_get(ptree, allocfn,
308 pctrie_trimkey(index, clev + 1), 2, clev);
309 if (tmp == NULL)
310 return (ENOMEM);
311 *parentp = tmp;
312 pctrie_addval(tmp, index, clev, val);
313 pctrie_addval(tmp, *m, clev, m);
314 return (0);
315 } else if (pctrie_keybarr(node, index))
316 break;
317 slot = pctrie_slot(index, node->pn_clev);
318 if (node->pn_child[slot] == NULL) {
319 node->pn_count++;
320 pctrie_addval(node, index, node->pn_clev, val);
321 return (0);
322 }
323 parentp = &node->pn_child[slot];
324 node = node->pn_child[slot];
325 }
326
327 /*
328 * A new node is needed because the right insertion level is reached.
329 * Setup the new intermediate node and add the 2 children: the
330 * new object and the older edge.
331 */
332 newind = node->pn_owner;
333 clev = pctrie_keydiff(newind, index);
334 tmp = pctrie_node_get(ptree, allocfn,
335 pctrie_trimkey(index, clev + 1), 2, clev);
336 if (tmp == NULL)
337 return (ENOMEM);
338 *parentp = tmp;
339 pctrie_addval(tmp, index, clev, val);
340 slot = pctrie_slot(newind, clev);
341 tmp->pn_child[slot] = node;
342
343 return (0);
344 }
345
346 /*
347 * Returns the value stored at the index. If the index is not present,
348 * NULL is returned.
349 */
350 uint64_t *
351 pctrie_lookup(struct pctrie *ptree, uint64_t index)
352 {
353 struct pctrie_node *node;
354 uint64_t *m;
355 int slot;
356
357 node = pctrie_getroot(ptree);
358 while (node != NULL) {
359 if (pctrie_isleaf(node)) {
360 m = pctrie_toval(node);
361 if (*m == index)
362 return (m);
363 else
364 break;
365 } else if (pctrie_keybarr(node, index))
366 break;
367 slot = pctrie_slot(index, node->pn_clev);
368 node = node->pn_child[slot];
369 }
370 return (NULL);
371 }
372
373 /*
374 * Look up the nearest entry at a position bigger than or equal to index.
375 */
376 uint64_t *
377 pctrie_lookup_ge(struct pctrie *ptree, uint64_t index)
378 {
379 struct pctrie_node *stack[PCTRIE_LIMIT];
380 uint64_t inc;
381 uint64_t *m;
382 struct pctrie_node *child, *node;
383 #ifdef INVARIANTS
384 int loops = 0;
385 #endif
386 int slot, tos;
387
388 node = pctrie_getroot(ptree);
389 if (node == NULL)
390 return (NULL);
391 else if (pctrie_isleaf(node)) {
392 m = pctrie_toval(node);
393 if (*m >= index)
394 return (m);
395 else
396 return (NULL);
397 }
398 tos = 0;
399 for (;;) {
400 /*
401 * If the keys differ before the current bisection node,
402 * then the search key might rollback to the earliest
403 * available bisection node or to the smallest key
404 * in the current node (if the owner is bigger than the
405 * search key).
406 */
407 if (pctrie_keybarr(node, index)) {
408 if (index > node->pn_owner) {
409 ascend:
410 KASSERT(++loops < 1000,
411 ("pctrie_lookup_ge: too many loops"));
412
413 /*
414 * Pop nodes from the stack until either the
415 * stack is empty or a node that could have a
416 * matching descendant is found.
417 */
418 do {
419 if (tos == 0)
420 return (NULL);
421 node = stack[--tos];
422 } while (pctrie_slot(index,
423 node->pn_clev) == (PCTRIE_COUNT - 1));
424
425 /*
426 * The following computation cannot overflow
427 * because index's slot at the current level
428 * is less than PCTRIE_COUNT - 1.
429 */
430 index = pctrie_trimkey(index,
431 node->pn_clev);
432 index += PCTRIE_UNITLEVEL(node->pn_clev);
433 } else
434 index = node->pn_owner;
435 KASSERT(!pctrie_keybarr(node, index),
436 ("pctrie_lookup_ge: keybarr failed"));
437 }
438 slot = pctrie_slot(index, node->pn_clev);
439 child = node->pn_child[slot];
440 if (pctrie_isleaf(child)) {
441 m = pctrie_toval(child);
442 if (*m >= index)
443 return (m);
444 } else if (child != NULL)
445 goto descend;
446
447 /*
448 * Look for an available edge or val within the current
449 * bisection node.
450 */
451 if (slot < (PCTRIE_COUNT - 1)) {
452 inc = PCTRIE_UNITLEVEL(node->pn_clev);
453 index = pctrie_trimkey(index, node->pn_clev);
454 do {
455 index += inc;
456 slot++;
457 child = node->pn_child[slot];
458 if (pctrie_isleaf(child)) {
459 m = pctrie_toval(child);
460 if (*m >= index)
461 return (m);
462 } else if (child != NULL)
463 goto descend;
464 } while (slot < (PCTRIE_COUNT - 1));
465 }
466 KASSERT(child == NULL || pctrie_isleaf(child),
467 ("pctrie_lookup_ge: child is radix node"));
468
469 /*
470 * If a value or edge bigger than the search slot is not found
471 * in the current node, ascend to the next higher-level node.
472 */
473 goto ascend;
474 descend:
475 KASSERT(node->pn_clev > 0,
476 ("pctrie_lookup_ge: pushing leaf's parent"));
477 KASSERT(tos < PCTRIE_LIMIT,
478 ("pctrie_lookup_ge: stack overflow"));
479 stack[tos++] = node;
480 node = child;
481 }
482 }
483
484 /*
485 * Look up the nearest entry at a position less than or equal to index.
486 */
487 uint64_t *
488 pctrie_lookup_le(struct pctrie *ptree, uint64_t index)
489 {
490 struct pctrie_node *stack[PCTRIE_LIMIT];
491 uint64_t inc;
492 uint64_t *m;
493 struct pctrie_node *child, *node;
494 #ifdef INVARIANTS
495 int loops = 0;
496 #endif
497 int slot, tos;
498
499 node = pctrie_getroot(ptree);
500 if (node == NULL)
501 return (NULL);
502 else if (pctrie_isleaf(node)) {
503 m = pctrie_toval(node);
504 if (*m <= index)
505 return (m);
506 else
507 return (NULL);
508 }
509 tos = 0;
510 for (;;) {
511 /*
512 * If the keys differ before the current bisection node,
513 * then the search key might rollback to the earliest
514 * available bisection node or to the largest key
515 * in the current node (if the owner is smaller than the
516 * search key).
517 */
518 if (pctrie_keybarr(node, index)) {
519 if (index > node->pn_owner) {
520 index = node->pn_owner + PCTRIE_COUNT *
521 PCTRIE_UNITLEVEL(node->pn_clev);
522 } else {
523 ascend:
524 KASSERT(++loops < 1000,
525 ("pctrie_lookup_le: too many loops"));
526
527 /*
528 * Pop nodes from the stack until either the
529 * stack is empty or a node that could have a
530 * matching descendant is found.
531 */
532 do {
533 if (tos == 0)
534 return (NULL);
535 node = stack[--tos];
536 } while (pctrie_slot(index,
537 node->pn_clev) == 0);
538
539 /*
540 * The following computation cannot overflow
541 * because index's slot at the current level
542 * is greater than 0.
543 */
544 index = pctrie_trimkey(index,
545 node->pn_clev);
546 }
547 index--;
548 KASSERT(!pctrie_keybarr(node, index),
549 ("pctrie_lookup_le: keybarr failed"));
550 }
551 slot = pctrie_slot(index, node->pn_clev);
552 child = node->pn_child[slot];
553 if (pctrie_isleaf(child)) {
554 m = pctrie_toval(child);
555 if (*m <= index)
556 return (m);
557 } else if (child != NULL)
558 goto descend;
559
560 /*
561 * Look for an available edge or value within the current
562 * bisection node.
563 */
564 if (slot > 0) {
565 inc = PCTRIE_UNITLEVEL(node->pn_clev);
566 index |= inc - 1;
567 do {
568 index -= inc;
569 slot--;
570 child = node->pn_child[slot];
571 if (pctrie_isleaf(child)) {
572 m = pctrie_toval(child);
573 if (*m <= index)
574 return (m);
575 } else if (child != NULL)
576 goto descend;
577 } while (slot > 0);
578 }
579 KASSERT(child == NULL || pctrie_isleaf(child),
580 ("pctrie_lookup_le: child is radix node"));
581
582 /*
583 * If a value or edge smaller than the search slot is not found
584 * in the current node, ascend to the next higher-level node.
585 */
586 goto ascend;
587 descend:
588 KASSERT(node->pn_clev > 0,
589 ("pctrie_lookup_le: pushing leaf's parent"));
590 KASSERT(tos < PCTRIE_LIMIT,
591 ("pctrie_lookup_le: stack overflow"));
592 stack[tos++] = node;
593 node = child;
594 }
595 }
596
597 /*
598 * Remove the specified index from the tree.
599 * Panics if the key is not present.
600 */
601 void
602 pctrie_remove(struct pctrie *ptree, uint64_t index, pctrie_free_t freefn)
603 {
604 struct pctrie_node *node, *parent;
605 uint64_t *m;
606 int i, slot;
607
608 node = pctrie_getroot(ptree);
609 if (pctrie_isleaf(node)) {
610 m = pctrie_toval(node);
611 if (*m != index)
612 panic("%s: invalid key found", __func__);
613 pctrie_setroot(ptree, NULL);
614 return;
615 }
616 parent = NULL;
617 for (;;) {
618 if (node == NULL)
619 panic("pctrie_remove: impossible to locate the key");
620 slot = pctrie_slot(index, node->pn_clev);
621 if (pctrie_isleaf(node->pn_child[slot])) {
622 m = pctrie_toval(node->pn_child[slot]);
623 if (*m != index)
624 panic("%s: invalid key found", __func__);
625 node->pn_child[slot] = NULL;
626 node->pn_count--;
627 if (node->pn_count > 1)
628 break;
629 for (i = 0; i < PCTRIE_COUNT; i++)
630 if (node->pn_child[i] != NULL)
631 break;
632 KASSERT(i != PCTRIE_COUNT,
633 ("%s: invalid node configuration", __func__));
634 if (parent == NULL)
635 pctrie_setroot(ptree, node->pn_child[i]);
636 else {
637 slot = pctrie_slot(index, parent->pn_clev);
638 KASSERT(parent->pn_child[slot] == node,
639 ("%s: invalid child value", __func__));
640 parent->pn_child[slot] = node->pn_child[i];
641 }
642 node->pn_count--;
643 node->pn_child[i] = NULL;
644 pctrie_node_put(ptree, node, freefn);
645 break;
646 }
647 parent = node;
648 node = node->pn_child[slot];
649 }
650 }
651
652 /*
653 * Remove and free all the nodes from the tree.
654 * This function is recursive but there is a tight control on it as the
655 * maximum depth of the tree is fixed.
656 */
657 void
658 pctrie_reclaim_allnodes(struct pctrie *ptree, pctrie_free_t freefn)
659 {
660 struct pctrie_node *root;
661
662 root = pctrie_getroot(ptree);
663 if (root == NULL)
664 return;
665 pctrie_setroot(ptree, NULL);
666 if (!pctrie_isleaf(root))
667 pctrie_reclaim_allnodes_int(ptree, root, freefn);
668 }
669
670 #ifdef DDB
671 /*
672 * Show details about the given node.
673 */
674 DB_SHOW_COMMAND(pctrienode, db_show_pctrienode)
675 {
676 struct pctrie_node *node;
677 int i;
678
679 if (!have_addr)
680 return;
681 node = (struct pctrie_node *)addr;
682 db_printf("node %p, owner %jx, children count %u, level %u:\n",
683 (void *)node, (uintmax_t)node->pn_owner, node->pn_count,
684 node->pn_clev);
685 for (i = 0; i < PCTRIE_COUNT; i++)
686 if (node->pn_child[i] != NULL)
687 db_printf("slot: %d, val: %p, value: %p, clev: %d\n",
688 i, (void *)node->pn_child[i],
689 pctrie_isleaf(node->pn_child[i]) ?
690 pctrie_toval(node->pn_child[i]) : NULL,
691 node->pn_clev);
692 }
693 #endif /* DDB */
Cache object: 9fc5629391758654f494c5b6de34c51b
|