FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_map.c


    1 /*      $NetBSD: uvm_map.c,v 1.403 2022/11/23 23:53:53 riastradh Exp $  */
    2 
    3 /*
    4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
    5  * Copyright (c) 1991, 1993, The Regents of the University of California.
    6  *
    7  * All rights reserved.
    8  *
    9  * This code is derived from software contributed to Berkeley by
   10  * The Mach Operating System project at Carnegie-Mellon University.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)vm_map.c    8.3 (Berkeley) 1/12/94
   37  * from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
   38  *
   39  *
   40  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
   41  * All rights reserved.
   42  *
   43  * Permission to use, copy, modify and distribute this software and
   44  * its documentation is hereby granted, provided that both the copyright
   45  * notice and this permission notice appear in all copies of the
   46  * software, derivative works or modified versions, and any portions
   47  * thereof, and that both notices appear in supporting documentation.
   48  *
   49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
   50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
   51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
   52  *
   53  * Carnegie Mellon requests users of this software to return to
   54  *
   55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
   56  *  School of Computer Science
   57  *  Carnegie Mellon University
   58  *  Pittsburgh PA 15213-3890
   59  *
   60  * any improvements or extensions that they make and grant Carnegie the
   61  * rights to redistribute these changes.
   62  */
   63 
   64 /*
   65  * uvm_map.c: uvm map operations
   66  */
   67 
   68 #include <sys/cdefs.h>
   69 __KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.403 2022/11/23 23:53:53 riastradh Exp $");
   70 
   71 #include "opt_ddb.h"
   72 #include "opt_pax.h"
   73 #include "opt_uvmhist.h"
   74 #include "opt_uvm.h"
   75 #include "opt_sysv.h"
   76 
   77 #include <sys/param.h>
   78 #include <sys/systm.h>
   79 #include <sys/mman.h>
   80 #include <sys/proc.h>
   81 #include <sys/pool.h>
   82 #include <sys/kernel.h>
   83 #include <sys/mount.h>
   84 #include <sys/pax.h>
   85 #include <sys/vnode.h>
   86 #include <sys/filedesc.h>
   87 #include <sys/lockdebug.h>
   88 #include <sys/atomic.h>
   89 #include <sys/sysctl.h>
   90 #ifndef __USER_VA0_IS_SAFE
   91 #include <sys/kauth.h>
   92 #include "opt_user_va0_disable_default.h"
   93 #endif
   94 
   95 #include <sys/shm.h>
   96 
   97 #include <uvm/uvm.h>
   98 #include <uvm/uvm_readahead.h>
   99 
  100 #if defined(DDB) || defined(DEBUGPRINT)
  101 #include <uvm/uvm_ddb.h>
  102 #endif
  103 
  104 #ifdef UVMHIST
  105 #ifndef UVMHIST_MAPHIST_SIZE
  106 #define UVMHIST_MAPHIST_SIZE 100
  107 #endif
  108 static struct kern_history_ent maphistbuf[UVMHIST_MAPHIST_SIZE];
  109 UVMHIST_DEFINE(maphist) = UVMHIST_INITIALIZER(maphist, maphistbuf);
  110 #endif
  111 
  112 #if !defined(UVMMAP_COUNTERS)
  113 
  114 #define UVMMAP_EVCNT_DEFINE(name)       /* nothing */
  115 #define UVMMAP_EVCNT_INCR(ev)           /* nothing */
  116 #define UVMMAP_EVCNT_DECR(ev)           /* nothing */
  117 
   118 #else /* defined(UVMMAP_COUNTERS) */
  119 
  120 #include <sys/evcnt.h>
  121 #define UVMMAP_EVCNT_DEFINE(name) \
  122 struct evcnt uvmmap_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
  123     "uvmmap", #name); \
  124 EVCNT_ATTACH_STATIC(uvmmap_evcnt_##name);
  125 #define UVMMAP_EVCNT_INCR(ev)           uvmmap_evcnt_##ev.ev_count++
  126 #define UVMMAP_EVCNT_DECR(ev)           uvmmap_evcnt_##ev.ev_count--
  127 
   128 #endif /* defined(UVMMAP_COUNTERS) */
  129 
  130 UVMMAP_EVCNT_DEFINE(ubackmerge)
  131 UVMMAP_EVCNT_DEFINE(uforwmerge)
  132 UVMMAP_EVCNT_DEFINE(ubimerge)
  133 UVMMAP_EVCNT_DEFINE(unomerge)
  134 UVMMAP_EVCNT_DEFINE(kbackmerge)
  135 UVMMAP_EVCNT_DEFINE(kforwmerge)
  136 UVMMAP_EVCNT_DEFINE(kbimerge)
  137 UVMMAP_EVCNT_DEFINE(knomerge)
  138 UVMMAP_EVCNT_DEFINE(map_call)
  139 UVMMAP_EVCNT_DEFINE(mlk_call)
  140 UVMMAP_EVCNT_DEFINE(mlk_hint)
  141 UVMMAP_EVCNT_DEFINE(mlk_tree)
  142 UVMMAP_EVCNT_DEFINE(mlk_treeloop)
  143 
  144 const char vmmapbsy[] = "vmmapbsy";
  145 
  146 /*
  147  * cache for vmspace structures.
  148  */
  149 
  150 static struct pool_cache uvm_vmspace_cache;
  151 
  152 /*
  153  * cache for dynamically-allocated map entries.
  154  */
  155 
  156 static struct pool_cache uvm_map_entry_cache;
  157 
  158 #ifdef PMAP_GROWKERNEL
  159 /*
  160  * This global represents the end of the kernel virtual address
  161  * space.  If we want to exceed this, we must grow the kernel
  162  * virtual address space dynamically.
  163  *
  164  * Note, this variable is locked by kernel_map's lock.
  165  */
  166 vaddr_t uvm_maxkaddr;
  167 #endif
  168 
  169 #ifndef __USER_VA0_IS_SAFE
  170 #ifndef __USER_VA0_DISABLE_DEFAULT
  171 #define __USER_VA0_DISABLE_DEFAULT 1
  172 #endif
  173 #ifdef USER_VA0_DISABLE_DEFAULT /* kernel config option overrides */
  174 #undef __USER_VA0_DISABLE_DEFAULT
  175 #define __USER_VA0_DISABLE_DEFAULT USER_VA0_DISABLE_DEFAULT
  176 #endif
  177 int user_va0_disable = __USER_VA0_DISABLE_DEFAULT;
  178 #endif
  179 
  180 /*
  181  * macros
  182  */
  183 
  184 /*
  185  * uvm_map_align_va: round down or up virtual address
  186  */
  187 static __inline void
  188 uvm_map_align_va(vaddr_t *vap, vsize_t align, int topdown)
  189 {
  190 
  191         KASSERT(powerof2(align));
  192 
  193         if (align != 0 && (*vap & (align - 1)) != 0) {
  194                 if (topdown)
  195                         *vap = rounddown2(*vap, align);
  196                 else
  197                         *vap = roundup2(*vap, align);
  198         }
  199 }
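
/*
 * Editor's note (illustrative, not part of the original file): with
 * align = 0x1000 and *vap = 0x12345, a topdown map rounds down to
 * 0x12000 (rounddown2) while a bottom-up map rounds up to 0x13000
 * (roundup2); an address that is already aligned is left untouched,
 * since (*vap & (align - 1)) is then zero.
 */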
  200 
  201 /*
  202  * UVM_ET_ISCOMPATIBLE: check some requirements for map entry merging
  203  */
  204 extern struct vm_map *pager_map;
  205 
  206 #define UVM_ET_ISCOMPATIBLE(ent, type, uobj, meflags, \
  207     prot, maxprot, inh, adv, wire) \
  208         ((ent)->etype == (type) && \
  209         (((ent)->flags ^ (meflags)) & (UVM_MAP_NOMERGE)) == 0 && \
  210         (ent)->object.uvm_obj == (uobj) && \
  211         (ent)->protection == (prot) && \
  212         (ent)->max_protection == (maxprot) && \
  213         (ent)->inheritance == (inh) && \
  214         (ent)->advice == (adv) && \
  215         (ent)->wired_count == (wire))
  216 
  217 /*
  218  * uvm_map_entry_link: insert entry into a map
  219  *
  220  * => map must be locked
  221  */
  222 #define uvm_map_entry_link(map, after_where, entry) do { \
  223         uvm_mapent_check(entry); \
  224         (map)->nentries++; \
  225         (entry)->prev = (after_where); \
  226         (entry)->next = (after_where)->next; \
  227         (entry)->prev->next = (entry); \
  228         (entry)->next->prev = (entry); \
  229         uvm_rb_insert((map), (entry)); \
  230 } while (/*CONSTCOND*/ 0)
  231 
  232 /*
  233  * uvm_map_entry_unlink: remove entry from a map
  234  *
  235  * => map must be locked
  236  */
  237 #define uvm_map_entry_unlink(map, entry) do { \
  238         KASSERT((entry) != (map)->first_free); \
  239         KASSERT((entry) != (map)->hint); \
  240         uvm_mapent_check(entry); \
  241         (map)->nentries--; \
  242         (entry)->next->prev = (entry)->prev; \
  243         (entry)->prev->next = (entry)->next; \
  244         uvm_rb_remove((map), (entry)); \
  245 } while (/*CONSTCOND*/ 0)
  246 
  247 /*
  248  * SAVE_HINT: saves the specified entry as the hint for future lookups.
  249  *
  250  * => map need not be locked.
  251  */
  252 #define SAVE_HINT(map, check, value) do { \
  253         if ((map)->hint == (check)) \
  254                 (map)->hint = (value); \
  255 } while (/*CONSTCOND*/ 0)
  256 
  257 /*
  258  * clear_hints: ensure that hints don't point to the entry.
  259  *
  260  * => map must be write-locked.
  261  */
  262 static void
  263 clear_hints(struct vm_map *map, struct vm_map_entry *ent)
  264 {
  265 
  266         SAVE_HINT(map, ent, ent->prev);
  267         if (map->first_free == ent) {
  268                 map->first_free = ent->prev;
  269         }
  270 }
  271 
  272 /*
  273  * VM_MAP_RANGE_CHECK: check and correct range
  274  *
  275  * => map must at least be read locked
  276  */
  277 
  278 #define VM_MAP_RANGE_CHECK(map, start, end) do { \
  279         if (start < vm_map_min(map))            \
  280                 start = vm_map_min(map);        \
  281         if (end > vm_map_max(map))              \
  282                 end = vm_map_max(map);          \
  283         if (start > end)                        \
  284                 start = end;                    \
  285 } while (/*CONSTCOND*/ 0)
  286 
  287 /*
  288  * local prototypes
  289  */
  290 
  291 static struct vm_map_entry *
  292                 uvm_mapent_alloc(struct vm_map *, int);
  293 static void     uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *);
  294 static void     uvm_mapent_free(struct vm_map_entry *);
  295 #if defined(DEBUG)
  296 static void     _uvm_mapent_check(const struct vm_map_entry *, int);
   297 #define uvm_mapent_check(e)     _uvm_mapent_check(e, __LINE__)
  298 #else /* defined(DEBUG) */
  299 #define uvm_mapent_check(e)     /* nothing */
  300 #endif /* defined(DEBUG) */
  301 
  302 static void     uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *);
  303 static void     uvm_map_reference_amap(struct vm_map_entry *, int);
  304 static int      uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
  305                     int, struct vm_map_entry *);
  306 static void     uvm_map_unreference_amap(struct vm_map_entry *, int);
  307 
  308 int _uvm_map_sanity(struct vm_map *);
  309 int _uvm_tree_sanity(struct vm_map *);
  310 static vsize_t uvm_rb_maxgap(const struct vm_map_entry *);
  311 
  312 #define ROOT_ENTRY(map)         ((struct vm_map_entry *)(map)->rb_tree.rbt_root)
  313 #define LEFT_ENTRY(entry)       ((struct vm_map_entry *)(entry)->rb_node.rb_left)
  314 #define RIGHT_ENTRY(entry)      ((struct vm_map_entry *)(entry)->rb_node.rb_right)
  315 #define PARENT_ENTRY(map, entry) \
  316         (ROOT_ENTRY(map) == (entry) \
  317             ? NULL : (struct vm_map_entry *)RB_FATHER(&(entry)->rb_node))
  318 
  319 /*
  320  * These get filled in if/when SYSVSHM shared memory code is loaded
  321  *
   322  * We do this with function pointers rather than #ifdef SYSVSHM so the
  323  * SYSVSHM code can be loaded and unloaded
  324  */
  325 void (*uvm_shmexit)(struct vmspace *) = NULL;
  326 void (*uvm_shmfork)(struct vmspace *, struct vmspace *) = NULL;
  327 
  328 static int
  329 uvm_map_compare_nodes(void *ctx, const void *nparent, const void *nkey)
  330 {
  331         const struct vm_map_entry *eparent = nparent;
  332         const struct vm_map_entry *ekey = nkey;
  333 
  334         KASSERT(eparent->start < ekey->start || eparent->start >= ekey->end);
  335         KASSERT(ekey->start < eparent->start || ekey->start >= eparent->end);
  336 
  337         if (eparent->start < ekey->start)
  338                 return -1;
  339         if (eparent->end >= ekey->start)
  340                 return 1;
  341         return 0;
  342 }
  343 
  344 static int
  345 uvm_map_compare_key(void *ctx, const void *nparent, const void *vkey)
  346 {
  347         const struct vm_map_entry *eparent = nparent;
  348         const vaddr_t va = *(const vaddr_t *) vkey;
  349 
  350         if (eparent->start < va)
  351                 return -1;
  352         if (eparent->end >= va)
  353                 return 1;
  354         return 0;
  355 }
  356 
  357 static const rb_tree_ops_t uvm_map_tree_ops = {
  358         .rbto_compare_nodes = uvm_map_compare_nodes,
  359         .rbto_compare_key = uvm_map_compare_key,
  360         .rbto_node_offset = offsetof(struct vm_map_entry, rb_node),
  361         .rbto_context = NULL
  362 };
  363 
  364 /*
  365  * uvm_rb_gap: return the gap size between our entry and next entry.
  366  */
  367 static inline vsize_t
  368 uvm_rb_gap(const struct vm_map_entry *entry)
  369 {
  370 
  371         KASSERT(entry->next != NULL);
  372         return entry->next->start - entry->end;
  373 }
  374 
  375 static vsize_t
  376 uvm_rb_maxgap(const struct vm_map_entry *entry)
  377 {
  378         struct vm_map_entry *child;
  379         vsize_t maxgap = entry->gap;
  380 
  381         /*
  382          * We need maxgap to be the largest gap of us or any of our
   383          * descendants.  Since each of our children's maxgap is the
  384          * cached value of their largest gap of themselves or their
   385          * descendants, we can just use that value and avoid recursing
  386          * down the tree to calculate it.
  387          */
  388         if ((child = LEFT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
  389                 maxgap = child->maxgap;
  390 
  391         if ((child = RIGHT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
  392                 maxgap = child->maxgap;
  393 
  394         return maxgap;
  395 }
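
/*
 * Editor's note (illustrative, not part of the original file): if this
 * entry's own gap is 2 pages while its left child caches maxgap = 5
 * pages and its right child caches maxgap = 3 pages, the result is 5
 * pages -- each child's cached maxgap already summarizes its whole
 * subtree, so no recursion is needed.
 */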
  396 
  397 static void
  398 uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry)
  399 {
  400         struct vm_map_entry *parent;
  401 
  402         KASSERT(entry->gap == uvm_rb_gap(entry));
  403         entry->maxgap = uvm_rb_maxgap(entry);
  404 
  405         while ((parent = PARENT_ENTRY(map, entry)) != NULL) {
  406                 struct vm_map_entry *brother;
  407                 vsize_t maxgap = parent->gap;
  408                 unsigned int which;
  409 
  410                 KDASSERT(parent->gap == uvm_rb_gap(parent));
  411                 if (maxgap < entry->maxgap)
  412                         maxgap = entry->maxgap;
  413                 /*
  414                  * Since we work towards the root, we know entry's maxgap
  415                  * value is OK, but its brothers may now be out-of-date due
  416                  * to rebalancing.  So refresh it.
  417                  */
  418                 which = RB_POSITION(&entry->rb_node) ^ RB_DIR_OTHER;
  419                 brother = (struct vm_map_entry *)parent->rb_node.rb_nodes[which];
  420                 if (brother != NULL) {
  421                         KDASSERT(brother->gap == uvm_rb_gap(brother));
  422                         brother->maxgap = uvm_rb_maxgap(brother);
  423                         if (maxgap < brother->maxgap)
  424                                 maxgap = brother->maxgap;
  425                 }
  426 
  427                 parent->maxgap = maxgap;
  428                 entry = parent;
  429         }
  430 }
  431 
  432 static void
  433 uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry)
  434 {
  435         struct vm_map_entry *ret __diagused;
  436 
  437         entry->gap = entry->maxgap = uvm_rb_gap(entry);
  438         if (entry->prev != &map->header)
  439                 entry->prev->gap = uvm_rb_gap(entry->prev);
  440 
  441         ret = rb_tree_insert_node(&map->rb_tree, entry);
  442         KASSERTMSG(ret == entry,
  443             "uvm_rb_insert: map %p: duplicate entry %p", map, ret);
  444 
  445         /*
  446          * If the previous entry is not our immediate left child, then it's an
  447          * ancestor and will be fixed up on the way to the root.  We don't
  448          * have to check entry->prev against &map->header since &map->header
  449          * will never be in the tree.
  450          */
  451         uvm_rb_fixup(map,
  452             LEFT_ENTRY(entry) == entry->prev ? entry->prev : entry);
  453 }
  454 
  455 static void
  456 uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry)
  457 {
  458         struct vm_map_entry *prev_parent = NULL, *next_parent = NULL;
  459 
  460         /*
  461          * If we are removing an interior node, then an adjacent node will
  462          * be used to replace its position in the tree.  Therefore we will
  463          * need to fixup the tree starting at the parent of the replacement
  464          * node.  So record their parents for later use.
  465          */
  466         if (entry->prev != &map->header)
  467                 prev_parent = PARENT_ENTRY(map, entry->prev);
  468         if (entry->next != &map->header)
  469                 next_parent = PARENT_ENTRY(map, entry->next);
  470 
  471         rb_tree_remove_node(&map->rb_tree, entry);
  472 
  473         /*
  474          * If the previous node has a new parent, fixup the tree starting
  475          * at the previous node's old parent.
  476          */
  477         if (entry->prev != &map->header) {
  478                 /*
  479                  * Update the previous entry's gap due to our absence.
  480                  */
  481                 entry->prev->gap = uvm_rb_gap(entry->prev);
  482                 uvm_rb_fixup(map, entry->prev);
  483                 if (prev_parent != NULL
  484                     && prev_parent != entry
  485                     && prev_parent != PARENT_ENTRY(map, entry->prev))
  486                         uvm_rb_fixup(map, prev_parent);
  487         }
  488 
  489         /*
  490          * If the next node has a new parent, fixup the tree starting
  491          * at the next node's old parent.
  492          */
  493         if (entry->next != &map->header) {
  494                 uvm_rb_fixup(map, entry->next);
  495                 if (next_parent != NULL
  496                     && next_parent != entry
  497                     && next_parent != PARENT_ENTRY(map, entry->next))
  498                         uvm_rb_fixup(map, next_parent);
  499         }
  500 }
  501 
  502 #if defined(DEBUG)
  503 int uvm_debug_check_map = 0;
  504 int uvm_debug_check_rbtree = 0;
  505 #define uvm_map_check(map, name) \
  506         _uvm_map_check((map), (name), __FILE__, __LINE__)
  507 static void
  508 _uvm_map_check(struct vm_map *map, const char *name,
  509     const char *file, int line)
  510 {
  511 
  512         if ((uvm_debug_check_map && _uvm_map_sanity(map)) ||
  513             (uvm_debug_check_rbtree && _uvm_tree_sanity(map))) {
  514                 panic("uvm_map_check failed: \"%s\" map=%p (%s:%d)",
  515                     name, map, file, line);
  516         }
  517 }
  518 #else /* defined(DEBUG) */
  519 #define uvm_map_check(map, name)        /* nothing */
  520 #endif /* defined(DEBUG) */
  521 
  522 #if defined(DEBUG) || defined(DDB)
  523 int
  524 _uvm_map_sanity(struct vm_map *map)
  525 {
  526         bool first_free_found = false;
  527         bool hint_found = false;
  528         const struct vm_map_entry *e;
  529         struct vm_map_entry *hint = map->hint;
  530 
  531         e = &map->header;
  532         for (;;) {
  533                 if (map->first_free == e) {
  534                         first_free_found = true;
  535                 } else if (!first_free_found && e->next->start > e->end) {
  536                         printf("first_free %p should be %p\n",
  537                             map->first_free, e);
  538                         return -1;
  539                 }
  540                 if (hint == e) {
  541                         hint_found = true;
  542                 }
  543 
  544                 e = e->next;
  545                 if (e == &map->header) {
  546                         break;
  547                 }
  548         }
  549         if (!first_free_found) {
  550                 printf("stale first_free\n");
  551                 return -1;
  552         }
  553         if (!hint_found) {
  554                 printf("stale hint\n");
  555                 return -1;
  556         }
  557         return 0;
  558 }
  559 
  560 int
  561 _uvm_tree_sanity(struct vm_map *map)
  562 {
  563         struct vm_map_entry *tmp, *trtmp;
  564         int n = 0, i = 1;
  565 
  566         for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
  567                 if (tmp->gap != uvm_rb_gap(tmp)) {
  568                         printf("%d/%d gap %#lx != %#lx %s\n",
  569                             n + 1, map->nentries,
  570                             (ulong)tmp->gap, (ulong)uvm_rb_gap(tmp),
  571                             tmp->next == &map->header ? "(last)" : "");
  572                         goto error;
  573                 }
  574                 /*
  575                  * If any entries are out of order, tmp->gap will be unsigned
  576                  * and will likely exceed the size of the map.
  577                  */
  578                 if (tmp->gap >= vm_map_max(map) - vm_map_min(map)) {
  579                         printf("too large gap %zu\n", (size_t)tmp->gap);
  580                         goto error;
  581                 }
  582                 n++;
  583         }
  584 
  585         if (n != map->nentries) {
  586                 printf("nentries: %d vs %d\n", n, map->nentries);
  587                 goto error;
  588         }
  589 
  590         trtmp = NULL;
  591         for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
  592                 if (tmp->maxgap != uvm_rb_maxgap(tmp)) {
  593                         printf("maxgap %#lx != %#lx\n",
  594                             (ulong)tmp->maxgap,
  595                             (ulong)uvm_rb_maxgap(tmp));
  596                         goto error;
  597                 }
  598                 if (trtmp != NULL && trtmp->start >= tmp->start) {
   599                             printf("corrupt: 0x%"PRIxVADDR" >= 0x%"PRIxVADDR"\n",
  600                             trtmp->start, tmp->start);
  601                         goto error;
  602                 }
  603 
  604                 trtmp = tmp;
  605         }
  606 
  607         for (tmp = map->header.next; tmp != &map->header;
  608             tmp = tmp->next, i++) {
  609                 trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_LEFT);
  610                 if (trtmp == NULL)
  611                         trtmp = &map->header;
  612                 if (tmp->prev != trtmp) {
  613                         printf("lookup: %d: %p->prev=%p: %p\n",
  614                             i, tmp, tmp->prev, trtmp);
  615                         goto error;
  616                 }
  617                 trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_RIGHT);
  618                 if (trtmp == NULL)
  619                         trtmp = &map->header;
  620                 if (tmp->next != trtmp) {
  621                         printf("lookup: %d: %p->next=%p: %p\n",
  622                             i, tmp, tmp->next, trtmp);
  623                         goto error;
  624                 }
  625                 trtmp = rb_tree_find_node(&map->rb_tree, &tmp->start);
  626                 if (trtmp != tmp) {
  627                         printf("lookup: %d: %p - %p: %p\n", i, tmp, trtmp,
  628                             PARENT_ENTRY(map, tmp));
  629                         goto error;
  630                 }
  631         }
  632 
  633         return (0);
  634  error:
  635         return (-1);
  636 }
  637 #endif /* defined(DEBUG) || defined(DDB) */
  638 
  639 /*
  640  * vm_map_lock: acquire an exclusive (write) lock on a map.
  641  *
  642  * => The locking protocol provides for guaranteed upgrade from shared ->
  643  *    exclusive by whichever thread currently has the map marked busy.
  644  *    See "LOCKING PROTOCOL NOTES" in uvm_map.h.  This is horrible; among
  645  *    other problems, it defeats any fairness guarantees provided by RW
  646  *    locks.
  647  */
  648 
  649 void
  650 vm_map_lock(struct vm_map *map)
  651 {
  652 
  653         for (;;) {
  654                 rw_enter(&map->lock, RW_WRITER);
  655                 if (map->busy == NULL || map->busy == curlwp) {
  656                         break;
  657                 }
  658                 mutex_enter(&map->misc_lock);
  659                 rw_exit(&map->lock);
  660                 if (map->busy != NULL) {
  661                         cv_wait(&map->cv, &map->misc_lock);
  662                 }
  663                 mutex_exit(&map->misc_lock);
  664         }
  665         map->timestamp++;
  666 }
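
#if 0
/*
 * Editor's sketch (not part of the original file): the plain exclusive
 * locking pattern, and the "busy" pattern that lets the owning thread
 * drop the write lock yet keep the guaranteed upgrade described in the
 * LOCKING PROTOCOL NOTES referenced above.  Error handling is omitted.
 */
static void
example_map_locking(struct vm_map *map)
{

        /* plain exclusive access */
        vm_map_lock(map);
        /* ... modify the map ... */
        vm_map_unlock(map);

        /* mark the map busy, then drop the write lock */
        vm_map_lock(map);
        vm_map_busy(map);
        vm_map_unlock(map);

        /*
         * Readers may still enter; other would-be writers wait on
         * map->cv until we clear the busy flag.  When we re-lock, the
         * loop in vm_map_lock() lets us through because
         * map->busy == curlwp.
         */
        vm_map_lock(map);
        vm_map_unbusy(map);
        vm_map_unlock(map);
}
#endif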
  667 
  668 /*
  669  * vm_map_lock_try: try to lock a map, failing if it is already locked.
  670  */
  671 
  672 bool
  673 vm_map_lock_try(struct vm_map *map)
  674 {
  675 
  676         if (!rw_tryenter(&map->lock, RW_WRITER)) {
  677                 return false;
  678         }
  679         if (map->busy != NULL) {
  680                 rw_exit(&map->lock);
  681                 return false;
  682         }
  683         map->timestamp++;
  684         return true;
  685 }
  686 
  687 /*
  688  * vm_map_unlock: release an exclusive lock on a map.
  689  */
  690 
  691 void
  692 vm_map_unlock(struct vm_map *map)
  693 {
  694 
  695         KASSERT(rw_write_held(&map->lock));
  696         KASSERT(map->busy == NULL || map->busy == curlwp);
  697         rw_exit(&map->lock);
  698 }
  699 
  700 /*
  701  * vm_map_unbusy: mark the map as unbusy, and wake any waiters that
  702  *     want an exclusive lock.
  703  */
  704 
  705 void
  706 vm_map_unbusy(struct vm_map *map)
  707 {
  708 
  709         KASSERT(map->busy == curlwp);
  710 
  711         /*
  712          * Safe to clear 'busy' and 'waiters' with only a read lock held:
  713          *
  714          * o they can only be set with a write lock held
  715          * o writers are blocked out with a read or write hold
  716          * o at any time, only one thread owns the set of values
  717          */
  718         mutex_enter(&map->misc_lock);
  719         map->busy = NULL;
  720         cv_broadcast(&map->cv);
  721         mutex_exit(&map->misc_lock);
  722 }
  723 
  724 /*
  725  * vm_map_lock_read: acquire a shared (read) lock on a map.
  726  */
  727 
  728 void
  729 vm_map_lock_read(struct vm_map *map)
  730 {
  731 
  732         rw_enter(&map->lock, RW_READER);
  733 }
  734 
  735 /*
  736  * vm_map_unlock_read: release a shared lock on a map.
  737  */
  738 
  739 void
  740 vm_map_unlock_read(struct vm_map *map)
  741 {
  742 
  743         rw_exit(&map->lock);
  744 }
  745 
  746 /*
  747  * vm_map_busy: mark a map as busy.
  748  *
  749  * => the caller must hold the map write locked
  750  */
  751 
  752 void
  753 vm_map_busy(struct vm_map *map)
  754 {
  755 
  756         KASSERT(rw_write_held(&map->lock));
  757         KASSERT(map->busy == NULL);
  758 
  759         map->busy = curlwp;
  760 }
  761 
  762 /*
  763  * vm_map_locked_p: return true if the map is write locked.
  764  *
  765  * => only for debug purposes like KASSERTs.
  766  * => should not be used to verify that a map is not locked.
  767  */
  768 
  769 bool
  770 vm_map_locked_p(struct vm_map *map)
  771 {
  772 
  773         return rw_write_held(&map->lock);
  774 }
  775 
  776 /*
  777  * uvm_mapent_alloc: allocate a map entry
  778  */
  779 
  780 static struct vm_map_entry *
  781 uvm_mapent_alloc(struct vm_map *map, int flags)
  782 {
  783         struct vm_map_entry *me;
  784         int pflags = (flags & UVM_FLAG_NOWAIT) ? PR_NOWAIT : PR_WAITOK;
  785         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
  786 
  787         me = pool_cache_get(&uvm_map_entry_cache, pflags);
  788         if (__predict_false(me == NULL)) {
  789                 return NULL;
  790         }
  791         me->flags = 0;
  792 
  793         UVMHIST_LOG(maphist, "<- new entry=%#jx [kentry=%jd]", (uintptr_t)me,
  794             (map == kernel_map), 0, 0);
  795         return me;
  796 }
  797 
  798 /*
  799  * uvm_mapent_free: free map entry
  800  */
  801 
  802 static void
  803 uvm_mapent_free(struct vm_map_entry *me)
  804 {
  805         UVMHIST_FUNC(__func__);
  806         UVMHIST_CALLARGS(maphist,"<- freeing map entry=%#jx [flags=%#jx]",
  807                 (uintptr_t)me, me->flags, 0, 0);
  808         pool_cache_put(&uvm_map_entry_cache, me);
  809 }
  810 
  811 /*
  812  * uvm_mapent_copy: copy a map entry, preserving flags
  813  */
  814 
  815 static inline void
  816 uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
  817 {
  818 
  819         memcpy(dst, src, sizeof(*dst));
  820         dst->flags = 0;
  821 }
  822 
  823 #if defined(DEBUG)
  824 static void
  825 _uvm_mapent_check(const struct vm_map_entry *entry, int line)
  826 {
  827 
  828         if (entry->start >= entry->end) {
  829                 goto bad;
  830         }
  831         if (UVM_ET_ISOBJ(entry)) {
  832                 if (entry->object.uvm_obj == NULL) {
  833                         goto bad;
  834                 }
  835         } else if (UVM_ET_ISSUBMAP(entry)) {
  836                 if (entry->object.sub_map == NULL) {
  837                         goto bad;
  838                 }
  839         } else {
  840                 if (entry->object.uvm_obj != NULL ||
  841                     entry->object.sub_map != NULL) {
  842                         goto bad;
  843                 }
  844         }
  845         if (!UVM_ET_ISOBJ(entry)) {
  846                 if (entry->offset != 0) {
  847                         goto bad;
  848                 }
  849         }
  850 
  851         return;
  852 
  853 bad:
  854         panic("%s: bad entry %p, line %d", __func__, entry, line);
  855 }
  856 #endif /* defined(DEBUG) */
  857 
  858 /*
  859  * uvm_map_entry_unwire: unwire a map entry
  860  *
  861  * => map should be locked by caller
  862  */
  863 
  864 static inline void
  865 uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry)
  866 {
  867 
  868         entry->wired_count = 0;
  869         uvm_fault_unwire_locked(map, entry->start, entry->end);
  870 }
  871 
  872 
  873 /*
  874  * wrapper for calling amap_ref()
  875  */
  876 static inline void
  877 uvm_map_reference_amap(struct vm_map_entry *entry, int flags)
  878 {
  879 
  880         amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff,
  881             (entry->end - entry->start) >> PAGE_SHIFT, flags);
  882 }
  883 
  884 
  885 /*
  886  * wrapper for calling amap_unref()
  887  */
  888 static inline void
  889 uvm_map_unreference_amap(struct vm_map_entry *entry, int flags)
  890 {
  891 
  892         amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff,
  893             (entry->end - entry->start) >> PAGE_SHIFT, flags);
  894 }
  895 
  896 
  897 /*
  898  * uvm_map_init: init mapping system at boot time.
  899  */
  900 
  901 void
  902 uvm_map_init(void)
  903 {
  904         /*
  905          * first, init logging system.
  906          */
  907 
  908         UVMHIST_FUNC(__func__);
  909         UVMHIST_LINK_STATIC(maphist);
  910         UVMHIST_LINK_STATIC(pdhist);
  911         UVMHIST_CALLED(maphist);
  912         UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0);
  913 
  914         /*
  915          * initialize the global lock for kernel map entry.
  916          */
  917 
  918         mutex_init(&uvm_kentry_lock, MUTEX_DRIVER, IPL_VM);
  919 }
  920 
  921 /*
  922  * uvm_map_init_caches: init mapping system caches.
  923  */
  924 void
  925 uvm_map_init_caches(void)
  926 {
  927         /*
  928          * initialize caches.
  929          */
  930 
  931         pool_cache_bootstrap(&uvm_map_entry_cache, sizeof(struct vm_map_entry),
  932             coherency_unit, 0, PR_LARGECACHE, "vmmpepl", NULL, IPL_NONE, NULL,
  933             NULL, NULL);
  934         pool_cache_bootstrap(&uvm_vmspace_cache, sizeof(struct vmspace),
  935             0, 0, 0, "vmsppl", NULL, IPL_NONE, NULL, NULL, NULL);
  936 }
  937 
  938 /*
  939  * clippers
  940  */
  941 
  942 /*
  943  * uvm_mapent_splitadj: adjust map entries for splitting, after uvm_mapent_copy.
  944  */
  945 
  946 static void
  947 uvm_mapent_splitadj(struct vm_map_entry *entry1, struct vm_map_entry *entry2,
  948     vaddr_t splitat)
  949 {
  950         vaddr_t adj;
  951 
  952         KASSERT(entry1->start < splitat);
  953         KASSERT(splitat < entry1->end);
  954 
  955         adj = splitat - entry1->start;
  956         entry1->end = entry2->start = splitat;
  957 
  958         if (entry1->aref.ar_amap) {
  959                 amap_splitref(&entry1->aref, &entry2->aref, adj);
  960         }
  961         if (UVM_ET_ISSUBMAP(entry1)) {
  962                 /* ... unlikely to happen, but play it safe */
  963                  uvm_map_reference(entry1->object.sub_map);
  964         } else if (UVM_ET_ISOBJ(entry1)) {
  965                 KASSERT(entry1->object.uvm_obj != NULL); /* suppress coverity */
  966                 entry2->offset += adj;
  967                 if (entry1->object.uvm_obj->pgops &&
  968                     entry1->object.uvm_obj->pgops->pgo_reference)
  969                         entry1->object.uvm_obj->pgops->pgo_reference(
  970                             entry1->object.uvm_obj);
  971         }
  972 }
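
/*
 * Editor's note (illustrative, not part of the original file): splitting
 * an entry that covers [0x1000, 0x5000) at splitat = 0x3000 gives
 * adj = 0x2000; entry1 ends up covering [0x1000, 0x3000) and entry2
 * [0x3000, 0x5000), entry2->offset is advanced by 0x2000 so the backing
 * object offsets still line up, and an extra reference is taken on the
 * shared object or submap for the second entry.
 */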
  973 
  974 /*
  975  * uvm_map_clip_start: ensure that the entry begins at or after
  976  *      the starting address, if it doesn't we split the entry.
  977  *
  978  * => caller should use UVM_MAP_CLIP_START macro rather than calling
  979  *    this directly
  980  * => map must be locked by caller
  981  */
  982 
  983 void
  984 uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry,
  985     vaddr_t start)
  986 {
  987         struct vm_map_entry *new_entry;
  988 
  989         /* uvm_map_simplify_entry(map, entry); */ /* XXX */
  990 
  991         uvm_map_check(map, "clip_start entry");
  992         uvm_mapent_check(entry);
  993 
  994         /*
  995          * Split off the front portion.  note that we must insert the new
  996          * entry BEFORE this one, so that this entry has the specified
  997          * starting address.
  998          */
  999         new_entry = uvm_mapent_alloc(map, 0);
 1000         uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
 1001         uvm_mapent_splitadj(new_entry, entry, start);
 1002         uvm_map_entry_link(map, entry->prev, new_entry);
 1003 
 1004         uvm_map_check(map, "clip_start leave");
 1005 }
 1006 
 1007 /*
 1008  * uvm_map_clip_end: ensure that the entry ends at or before
  1009  *      the ending address, if it doesn't we split the entry
 1010  *
 1011  * => caller should use UVM_MAP_CLIP_END macro rather than calling
 1012  *    this directly
 1013  * => map must be locked by caller
 1014  */
 1015 
 1016 void
 1017 uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end)
 1018 {
 1019         struct vm_map_entry *new_entry;
 1020 
 1021         uvm_map_check(map, "clip_end entry");
 1022         uvm_mapent_check(entry);
 1023 
 1024         /*
 1025          *      Create a new entry and insert it
 1026          *      AFTER the specified entry
 1027          */
 1028         new_entry = uvm_mapent_alloc(map, 0);
 1029         uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
 1030         uvm_mapent_splitadj(entry, new_entry, end);
 1031         uvm_map_entry_link(map, entry, new_entry);
 1032 
 1033         uvm_map_check(map, "clip_end leave");
 1034 }
 1035 
 1036 /*
 1037  *   M A P   -   m a i n   e n t r y   p o i n t
 1038  */
 1039 /*
 1040  * uvm_map: establish a valid mapping in a map
 1041  *
 1042  * => assume startp is page aligned.
 1043  * => assume size is a multiple of PAGE_SIZE.
 1044  * => assume sys_mmap provides enough of a "hint" to have us skip
 1045  *      over text/data/bss area.
 1046  * => map must be unlocked (we will lock it)
 1047  * => <uobj,uoffset> value meanings (4 cases):
 1048  *       [1] <NULL,uoffset>             == uoffset is a hint for PMAP_PREFER
 1049  *       [2] <NULL,UVM_UNKNOWN_OFFSET>  == don't PMAP_PREFER
 1050  *       [3] <uobj,uoffset>             == normal mapping
 1051  *       [4] <uobj,UVM_UNKNOWN_OFFSET>  == uvm_map finds offset based on VA
 1052  *
 1053  *    case [4] is for kernel mappings where we don't know the offset until
 1054  *    we've found a virtual address.   note that kernel object offsets are
 1055  *    always relative to vm_map_min(kernel_map).
 1056  *
 1057  * => if `align' is non-zero, we align the virtual address to the specified
 1058  *      alignment.
 1059  *      this is provided as a mechanism for large pages.
 1060  *
 1061  * => XXXCDC: need way to map in external amap?
 1062  */
 1063 
 1064 int
 1065 uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size,
 1066     struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags)
 1067 {
 1068         struct uvm_map_args args;
 1069         struct vm_map_entry *new_entry;
 1070         int error;
 1071 
 1072         KASSERT((size & PAGE_MASK) == 0);
 1073         KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
 1074 
 1075         /*
 1076          * for pager_map, allocate the new entry first to avoid sleeping
 1077          * for memory while we have the map locked.
 1078          */
 1079 
 1080         new_entry = NULL;
 1081         if (map == pager_map) {
 1082                 new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
 1083                 if (__predict_false(new_entry == NULL))
 1084                         return ENOMEM;
 1085         }
 1086         if (map == pager_map)
 1087                 flags |= UVM_FLAG_NOMERGE;
 1088 
 1089         error = uvm_map_prepare(map, *startp, size, uobj, uoffset, align,
 1090             flags, &args);
 1091         if (!error) {
 1092                 error = uvm_map_enter(map, &args, new_entry);
 1093                 *startp = args.uma_start;
 1094         } else if (new_entry) {
 1095                 uvm_mapent_free(new_entry);
 1096         }
 1097 
 1098 #if defined(DEBUG)
 1099         if (!error && VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
 1100                 uvm_km_check_empty(map, *startp, *startp + size);
 1101         }
 1102 #endif /* defined(DEBUG) */
 1103 
 1104         return error;
 1105 }
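
#if 0
/*
 * Editor's sketch (not part of the original file): a minimal kernel-map
 * allocation in the style of case [4] above -- no fixed address, and the
 * object offset discovered from the chosen VA.  UVM_MAPFLAG and the
 * UVM_PROT_RW, UVM_INH_NONE and UVM_ADV_RANDOM constants are assumed
 * from uvm_extern.h, and uobj must be a kernel object when passing
 * UVM_UNKNOWN_OFFSET (see uvm_map_prepare() below); most kernel code
 * would go through uvm_km_alloc() or uvm_map_reserve() rather than
 * calling uvm_map() directly.
 */
static int
example_uvm_map(struct uvm_object *uobj, vsize_t size, vaddr_t *vap)
{

        *vap = vm_map_min(kernel_map);
        return uvm_map(kernel_map, vap, size, uobj, UVM_UNKNOWN_OFFSET, 0,
            UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_NONE,
            UVM_ADV_RANDOM, 0));
}
#endif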
 1106 
 1107 /*
 1108  * uvm_map_prepare:
 1109  *
 1110  * called with map unlocked.
 1111  * on success, returns the map locked.
 1112  */
 1113 
 1114 int
 1115 uvm_map_prepare(struct vm_map *map, vaddr_t start, vsize_t size,
 1116     struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags,
 1117     struct uvm_map_args *args)
 1118 {
 1119         struct vm_map_entry *prev_entry;
 1120         vm_prot_t prot = UVM_PROTECTION(flags);
 1121         vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
 1122 
 1123         UVMHIST_FUNC(__func__);
 1124         UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%jx, flags=%#jx)",
 1125             (uintptr_t)map, start, size, flags);
 1126         UVMHIST_LOG(maphist, "  uobj/offset %#jx/%jd", (uintptr_t)uobj,
 1127             uoffset,0,0);
 1128 
 1129         /*
 1130          * detect a popular device driver bug.
 1131          */
 1132 
 1133         KASSERT(doing_shutdown || curlwp != NULL);
 1134 
 1135         /*
 1136          * zero-sized mapping doesn't make any sense.
 1137          */
 1138         KASSERT(size > 0);
 1139 
 1140         KASSERT((~flags & (UVM_FLAG_NOWAIT | UVM_FLAG_WAITVA)) != 0);
 1141 
 1142         uvm_map_check(map, "map entry");
 1143 
 1144         /*
 1145          * check sanity of protection code
 1146          */
 1147 
 1148         if ((prot & maxprot) != prot) {
 1149                 UVMHIST_LOG(maphist, "<- prot. failure:  prot=%#jx, max=%#jx",
 1150                 prot, maxprot,0,0);
 1151                 return EACCES;
 1152         }
 1153 
 1154         /*
 1155          * figure out where to put new VM range
 1156          */
 1157 retry:
 1158         if (vm_map_lock_try(map) == false) {
 1159                 if ((flags & UVM_FLAG_TRYLOCK) != 0) {
 1160                         return EAGAIN;
 1161                 }
 1162                 vm_map_lock(map); /* could sleep here */
 1163         }
 1164         if (flags & UVM_FLAG_UNMAP) {
 1165                 KASSERT(flags & UVM_FLAG_FIXED);
 1166                 KASSERT((flags & UVM_FLAG_NOWAIT) == 0);
 1167 
 1168                 /*
 1169                  * Set prev_entry to what it will need to be after any existing
 1170                  * entries are removed later in uvm_map_enter().
 1171                  */
 1172 
 1173                 if (uvm_map_lookup_entry(map, start, &prev_entry)) {
 1174                         if (start == prev_entry->start)
 1175                                 prev_entry = prev_entry->prev;
 1176                         else
 1177                                 UVM_MAP_CLIP_END(map, prev_entry, start);
 1178                         SAVE_HINT(map, map->hint, prev_entry);
 1179                 }
 1180         } else {
 1181                 prev_entry = uvm_map_findspace(map, start, size, &start,
 1182                     uobj, uoffset, align, flags);
 1183         }
 1184         if (prev_entry == NULL) {
 1185                 unsigned int timestamp;
 1186 
 1187                 timestamp = map->timestamp;
 1188                 UVMHIST_LOG(maphist,"waiting va timestamp=%#jx",
 1189                             timestamp,0,0,0);
 1190                 map->flags |= VM_MAP_WANTVA;
 1191                 vm_map_unlock(map);
 1192 
 1193                 /*
 1194                  * try to reclaim kva and wait until someone does unmap.
 1195                  * fragile locking here, so we awaken every second to
 1196                  * recheck the condition.
 1197                  */
 1198 
 1199                 mutex_enter(&map->misc_lock);
 1200                 while ((map->flags & VM_MAP_WANTVA) != 0 &&
 1201                    map->timestamp == timestamp) {
 1202                         if ((flags & UVM_FLAG_WAITVA) == 0) {
 1203                                 mutex_exit(&map->misc_lock);
 1204                                 UVMHIST_LOG(maphist,
 1205                                     "<- uvm_map_findspace failed!", 0,0,0,0);
 1206                                 return ENOMEM;
 1207                         } else {
 1208                                 cv_timedwait(&map->cv, &map->misc_lock, hz);
 1209                         }
 1210                 }
 1211                 mutex_exit(&map->misc_lock);
 1212                 goto retry;
 1213         }
 1214 
 1215 #ifdef PMAP_GROWKERNEL
 1216         /*
 1217          * If the kernel pmap can't map the requested space,
 1218          * then allocate more resources for it.
 1219          */
 1220         if (map == kernel_map && uvm_maxkaddr < (start + size))
 1221                 uvm_maxkaddr = pmap_growkernel(start + size);
 1222 #endif
 1223 
 1224         UVMMAP_EVCNT_INCR(map_call);
 1225 
 1226         /*
 1227          * if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER
 1228          * [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET.   in
 1229          * either case we want to zero it  before storing it in the map entry
 1230          * (because it looks strange and confusing when debugging...)
 1231          *
 1232          * if uobj is not null
 1233          *   if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping
 1234          *      and we do not need to change uoffset.
 1235          *   if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset
 1236          *      now (based on the starting address of the map).   this case is
 1237          *      for kernel object mappings where we don't know the offset until
 1238          *      the virtual address is found (with uvm_map_findspace).   the
 1239          *      offset is the distance we are from the start of the map.
 1240          */
 1241 
 1242         if (uobj == NULL) {
 1243                 uoffset = 0;
 1244         } else {
 1245                 if (uoffset == UVM_UNKNOWN_OFFSET) {
 1246                         KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
 1247                         uoffset = start - vm_map_min(kernel_map);
 1248                 }
 1249         }
 1250 
 1251         args->uma_flags = flags;
 1252         args->uma_prev = prev_entry;
 1253         args->uma_start = start;
 1254         args->uma_size = size;
 1255         args->uma_uobj = uobj;
 1256         args->uma_uoffset = uoffset;
 1257 
 1258         UVMHIST_LOG(maphist, "<- done!", 0,0,0,0);
 1259         return 0;
 1260 }
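
/*
 * Editor's note (not part of the original file): uvm_map() above is the
 * canonical caller of this pair -- uvm_map_prepare() returns with the
 * map locked on success, and uvm_map_enter() below consumes that lock,
 * unlocking the map before it returns.
 */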
 1261 
 1262 /*
 1263  * uvm_map_enter:
 1264  *
 1265  * called with map locked.
 1266  * unlock the map before returning.
 1267  */
 1268 
 1269 int
 1270 uvm_map_enter(struct vm_map *map, const struct uvm_map_args *args,
 1271     struct vm_map_entry *new_entry)
 1272 {
 1273         struct vm_map_entry *prev_entry = args->uma_prev;
 1274         struct vm_map_entry *dead = NULL, *dead_entries = NULL;
 1275 
 1276         const uvm_flag_t flags = args->uma_flags;
 1277         const vm_prot_t prot = UVM_PROTECTION(flags);
 1278         const vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
 1279         const vm_inherit_t inherit = UVM_INHERIT(flags);
 1280         const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ?
 1281             AMAP_EXTEND_NOWAIT : 0;
 1282         const int advice = UVM_ADVICE(flags);
 1283 
 1284         vaddr_t start = args->uma_start;
 1285         vsize_t size = args->uma_size;
 1286         struct uvm_object *uobj = args->uma_uobj;
 1287         voff_t uoffset = args->uma_uoffset;
 1288 
 1289         const int kmap = (vm_map_pmap(map) == pmap_kernel());
 1290         int merged = 0;
 1291         int error;
 1292         int newetype;
 1293 
 1294         UVMHIST_FUNC(__func__);
 1295         UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)",
 1296             (uintptr_t)map, start, size, flags);
 1297         UVMHIST_LOG(maphist, "  uobj/offset %#jx/%jd", (uintptr_t)uobj,
 1298             uoffset,0,0);
 1299 
 1300         KASSERT(map->hint == prev_entry); /* bimerge case assumes this */
 1301         KASSERT(vm_map_locked_p(map));
 1302         KASSERT((flags & (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)) !=
 1303                 (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP));
 1304 
 1305         if (uobj)
 1306                 newetype = UVM_ET_OBJ;
 1307         else
 1308                 newetype = 0;
 1309 
 1310         if (flags & UVM_FLAG_COPYONW) {
 1311                 newetype |= UVM_ET_COPYONWRITE;
 1312                 if ((flags & UVM_FLAG_OVERLAY) == 0)
 1313                         newetype |= UVM_ET_NEEDSCOPY;
 1314         }
 1315 
 1316         /*
 1317          * For mappings with unmap, remove any old entries now.  Adding the new
 1318          * entry cannot fail because that can only happen if UVM_FLAG_NOWAIT
 1319          * is set, and we do not support nowait and unmap together.
 1320          */
 1321 
 1322         if (flags & UVM_FLAG_UNMAP) {
 1323                 KASSERT(flags & UVM_FLAG_FIXED);
 1324                 uvm_unmap_remove(map, start, start + size, &dead_entries, 0);
 1325 #ifdef DEBUG
 1326                 struct vm_map_entry *tmp_entry __diagused;
 1327                 bool rv __diagused;
 1328 
 1329                 rv = uvm_map_lookup_entry(map, start, &tmp_entry);
 1330                 KASSERT(!rv);
 1331                 KASSERTMSG(prev_entry == tmp_entry,
 1332                            "args %p prev_entry %p tmp_entry %p",
 1333                            args, prev_entry, tmp_entry);
 1334 #endif
 1335                 SAVE_HINT(map, map->hint, prev_entry);
 1336         }
 1337 
 1338         /*
 1339          * try and insert in map by extending previous entry, if possible.
 1340          * XXX: we don't try and pull back the next entry.   might be useful
 1341          * for a stack, but we are currently allocating our stack in advance.
 1342          */
 1343 
 1344         if (flags & UVM_FLAG_NOMERGE)
 1345                 goto nomerge;
 1346 
 1347         if (prev_entry->end == start &&
 1348             prev_entry != &map->header &&
 1349             UVM_ET_ISCOMPATIBLE(prev_entry, newetype, uobj, 0,
 1350             prot, maxprot, inherit, advice, 0)) {
 1351 
 1352                 if (uobj && prev_entry->offset +
 1353                     (prev_entry->end - prev_entry->start) != uoffset)
 1354                         goto forwardmerge;
 1355 
 1356                 /*
 1357                  * can't extend a shared amap.  note: no need to lock amap to
 1358                  * look at refs since we don't care about its exact value.
  1359          * if it is one (i.e. we have the only reference) it will stay there
 1360                  */
 1361 
 1362                 if (prev_entry->aref.ar_amap &&
 1363                     amap_refs(prev_entry->aref.ar_amap) != 1) {
 1364                         goto forwardmerge;
 1365                 }
 1366 
 1367                 if (prev_entry->aref.ar_amap) {
 1368                         error = amap_extend(prev_entry, size,
 1369                             amapwaitflag | AMAP_EXTEND_FORWARDS);
 1370                         if (error)
 1371                                 goto nomerge;
 1372                 }
 1373 
 1374                 if (kmap) {
 1375                         UVMMAP_EVCNT_INCR(kbackmerge);
 1376                 } else {
 1377                         UVMMAP_EVCNT_INCR(ubackmerge);
 1378                 }
 1379                 UVMHIST_LOG(maphist,"  starting back merge", 0, 0, 0, 0);
 1380 
 1381                 /*
 1382                  * drop our reference to uobj since we are extending a reference
 1383                  * that we already have (the ref count can not drop to zero).
 1384                  */
 1385 
 1386                 if (uobj && uobj->pgops->pgo_detach)
 1387                         uobj->pgops->pgo_detach(uobj);
 1388 
 1389                 /*
 1390                  * Now that we've merged the entries, note that we've grown
 1391                  * and our gap has shrunk.  Then fix the tree.
 1392                  */
 1393                 prev_entry->end += size;
 1394                 prev_entry->gap -= size;
 1395                 uvm_rb_fixup(map, prev_entry);
 1396 
 1397                 uvm_map_check(map, "map backmerged");
 1398 
 1399                 UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0);
 1400                 merged++;
 1401         }
 1402 
 1403 forwardmerge:
 1404         if (prev_entry->next->start == (start + size) &&
 1405             prev_entry->next != &map->header &&
 1406             UVM_ET_ISCOMPATIBLE(prev_entry->next, newetype, uobj, 0,
 1407             prot, maxprot, inherit, advice, 0)) {
 1408 
 1409                 if (uobj && prev_entry->next->offset != uoffset + size)
 1410                         goto nomerge;
 1411 
 1412                 /*
 1413                  * can't extend a shared amap.  note: no need to lock amap to
 1414                  * look at refs since we don't care about its exact value.
  1415          * if it is one (i.e. we have the only reference) it will stay there.
 1416                  *
 1417                  * note that we also can't merge two amaps, so if we
 1418                  * merged with the previous entry which has an amap,
 1419                  * and the next entry also has an amap, we give up.
 1420                  *
 1421                  * Interesting cases:
 1422                  * amap, new, amap -> give up second merge (single fwd extend)
 1423                  * amap, new, none -> double forward extend (extend again here)
 1424                  * none, new, amap -> double backward extend (done here)
 1425                  * uobj, new, amap -> single backward extend (done here)
 1426                  *
 1427                  * XXX should we attempt to deal with someone refilling
 1428                  * the deallocated region between two entries that are
 1429                  * backed by the same amap (ie, arefs is 2, "prev" and
 1430                  * "next" refer to it, and adding this allocation will
 1431                  * close the hole, thus restoring arefs to 1 and
 1432                  * deallocating the "next" vm_map_entry)?  -- @@@
 1433                  */
 1434 
 1435                 if (prev_entry->next->aref.ar_amap &&
 1436                     (amap_refs(prev_entry->next->aref.ar_amap) != 1 ||
 1437                      (merged && prev_entry->aref.ar_amap))) {
 1438                         goto nomerge;
 1439                 }
 1440 
 1441                 if (merged) {
 1442                         /*
 1443                          * Try to extend the amap of the previous entry to
 1444                          * cover the next entry as well.  If it doesn't work
 1445                          * just skip on, don't actually give up, since we've
 1446                          * already completed the back merge.
 1447                          */
 1448                         if (prev_entry->aref.ar_amap) {
 1449                                 if (amap_extend(prev_entry,
 1450                                     prev_entry->next->end -
 1451                                     prev_entry->next->start,
 1452                                     amapwaitflag | AMAP_EXTEND_FORWARDS))
 1453                                         goto nomerge;
 1454                         }
 1455 
 1456                         /*
 1457                          * Try to extend the amap of the *next* entry
 1458                          * back to cover the new allocation *and* the
 1459                          * previous entry as well (the previous merge
 1460                          * didn't have an amap already otherwise we
 1461                          * wouldn't be checking here for an amap).  If
 1462                          * it doesn't work just skip on, again, don't
 1463                          * actually give up, since we've already
 1464                          * completed the back merge.
 1465                          */
 1466                         else if (prev_entry->next->aref.ar_amap) {
 1467                                 if (amap_extend(prev_entry->next,
 1468                                     prev_entry->end -
 1469                                     prev_entry->start,
 1470                                     amapwaitflag | AMAP_EXTEND_BACKWARDS))
 1471                                         goto nomerge;
 1472                         }
 1473                 } else {
 1474                         /*
 1475                          * Pull the next entry's amap backwards to cover this
 1476                          * new allocation.
 1477                          */
 1478                         if (prev_entry->next->aref.ar_amap) {
 1479                                 error = amap_extend(prev_entry->next, size,
 1480                                     amapwaitflag | AMAP_EXTEND_BACKWARDS);
 1481                                 if (error)
 1482                                         goto nomerge;
 1483                         }
 1484                 }
 1485 
 1486                 if (merged) {
 1487                         if (kmap) {
 1488                                 UVMMAP_EVCNT_DECR(kbackmerge);
 1489                                 UVMMAP_EVCNT_INCR(kbimerge);
 1490                         } else {
 1491                                 UVMMAP_EVCNT_DECR(ubackmerge);
 1492                                 UVMMAP_EVCNT_INCR(ubimerge);
 1493                         }
 1494                 } else {
 1495                         if (kmap) {
 1496                                 UVMMAP_EVCNT_INCR(kforwmerge);
 1497                         } else {
 1498                                 UVMMAP_EVCNT_INCR(uforwmerge);
 1499                         }
 1500                 }
 1501                 UVMHIST_LOG(maphist,"  starting forward merge", 0, 0, 0, 0);
 1502 
 1503                 /*
 1504                  * drop our reference to uobj since we are extending a reference
 1505                  * that we already have (the ref count can not drop to zero).
 1506                  */
 1507                 if (uobj && uobj->pgops->pgo_detach)
 1508                         uobj->pgops->pgo_detach(uobj);
 1509 
 1510                 if (merged) {
 1511                         dead = prev_entry->next;
 1512                         prev_entry->end = dead->end;
 1513                         uvm_map_entry_unlink(map, dead);
 1514                         if (dead->aref.ar_amap != NULL) {
 1515                                 prev_entry->aref = dead->aref;
 1516                                 dead->aref.ar_amap = NULL;
 1517                         }
 1518                 } else {
 1519                         prev_entry->next->start -= size;
 1520                         if (prev_entry != &map->header) {
 1521                                 prev_entry->gap -= size;
 1522                                 KASSERT(prev_entry->gap == uvm_rb_gap(prev_entry));
 1523                                 uvm_rb_fixup(map, prev_entry);
 1524                         }
 1525                         if (uobj)
 1526                                 prev_entry->next->offset = uoffset;
 1527                 }
 1528 
 1529                 uvm_map_check(map, "map forwardmerged");
 1530 
 1531                 UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0);
 1532                 merged++;
 1533         }
 1534 
 1535 nomerge:
 1536         if (!merged) {
 1537                 UVMHIST_LOG(maphist,"  allocating new map entry", 0, 0, 0, 0);
 1538                 if (kmap) {
 1539                         UVMMAP_EVCNT_INCR(knomerge);
 1540                 } else {
 1541                         UVMMAP_EVCNT_INCR(unomerge);
 1542                 }
 1543 
 1544                 /*
 1545                  * allocate new entry and link it in.
 1546                  */
 1547 
 1548                 if (new_entry == NULL) {
 1549                         new_entry = uvm_mapent_alloc(map,
 1550                                 (flags & UVM_FLAG_NOWAIT));
 1551                         if (__predict_false(new_entry == NULL)) {
 1552                                 error = ENOMEM;
 1553                                 goto done;
 1554                         }
 1555                 }
 1556                 new_entry->start = start;
 1557                 new_entry->end = new_entry->start + size;
 1558                 new_entry->object.uvm_obj = uobj;
 1559                 new_entry->offset = uoffset;
 1560 
 1561                 new_entry->etype = newetype;
 1562 
 1563                 if (flags & UVM_FLAG_NOMERGE) {
 1564                         new_entry->flags |= UVM_MAP_NOMERGE;
 1565                 }
 1566 
 1567                 new_entry->protection = prot;
 1568                 new_entry->max_protection = maxprot;
 1569                 new_entry->inheritance = inherit;
 1570                 new_entry->wired_count = 0;
 1571                 new_entry->advice = advice;
 1572                 if (flags & UVM_FLAG_OVERLAY) {
 1573 
 1574                         /*
 1575                          * to_add: for BSS we overallocate a little since we
 1576                          * are likely to extend
 1577                          */
 1578 
 1579                         vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ?
 1580                                 UVM_AMAP_CHUNK << PAGE_SHIFT : 0;
 1581                         struct vm_amap *amap = amap_alloc(size, to_add,
 1582                             (flags & UVM_FLAG_NOWAIT));
 1583                         if (__predict_false(amap == NULL)) {
 1584                                 error = ENOMEM;
 1585                                 goto done;
 1586                         }
 1587                         new_entry->aref.ar_pageoff = 0;
 1588                         new_entry->aref.ar_amap = amap;
 1589                 } else {
 1590                         new_entry->aref.ar_pageoff = 0;
 1591                         new_entry->aref.ar_amap = NULL;
 1592                 }
 1593                 uvm_map_entry_link(map, prev_entry, new_entry);
 1594 
 1595                 /*
 1596                  * Update the free space hint
 1597                  */
 1598 
 1599                 if ((map->first_free == prev_entry) &&
 1600                     (prev_entry->end >= new_entry->start))
 1601                         map->first_free = new_entry;
 1602 
 1603                 new_entry = NULL;
 1604         }
 1605 
 1606         map->size += size;
 1607 
 1608         UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
 1609 
 1610         error = 0;
 1611 
 1612 done:
 1613         vm_map_unlock(map);
 1614 
 1615         if (new_entry) {
 1616                 uvm_mapent_free(new_entry);
 1617         }
 1618         if (dead) {
 1619                 KDASSERT(merged);
 1620                 uvm_mapent_free(dead);
 1621         }
 1622         if (dead_entries)
 1623                 uvm_unmap_detach(dead_entries, 0);
 1624 
 1625         return error;
 1626 }
 1627 
 1628 /*
 1629  * uvm_map_lookup_entry_bytree: lookup an entry in tree
 1630  */
 1631 
 1632 static inline bool
 1633 uvm_map_lookup_entry_bytree(struct vm_map *map, vaddr_t address,
 1634     struct vm_map_entry **entry /* OUT */)
 1635 {
 1636         struct vm_map_entry *prev = &map->header;
 1637         struct vm_map_entry *cur = ROOT_ENTRY(map);
 1638 
 1639         while (cur) {
 1640                 UVMMAP_EVCNT_INCR(mlk_treeloop);
 1641                 if (address >= cur->start) {
 1642                         if (address < cur->end) {
 1643                                 *entry = cur;
 1644                                 return true;
 1645                         }
 1646                         prev = cur;
 1647                         cur = RIGHT_ENTRY(cur);
 1648                 } else
 1649                         cur = LEFT_ENTRY(cur);
 1650         }
 1651         *entry = prev;
 1652         return false;
 1653 }
 1654 
 1655 /*
 1656  * uvm_map_lookup_entry: find map entry at or before an address
 1657  *
 1658  * => map must at least be read-locked by caller
 1659  * => entry is returned in "entry"
 1660  * => return value is true if address is in the returned entry
 1661  */
 1662 
 1663 bool
 1664 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
 1665     struct vm_map_entry **entry /* OUT */)
 1666 {
 1667         struct vm_map_entry *cur;
 1668         UVMHIST_FUNC(__func__);
 1669         UVMHIST_CALLARGS(maphist,"(map=%#jx,addr=%#jx,ent=%#jx)",
 1670             (uintptr_t)map, address, (uintptr_t)entry, 0);
 1671 
 1672         /*
 1673          * make a quick check to see if we are already looking at
 1674          * the entry we want (which is usually the case).  note also
 1675          * that we don't need to save the hint here...  it is the
 1676          * same hint (unless we are at the header, in which case the
 1677          * hint didn't buy us anything anyway).
 1678          */
 1679 
 1680         cur = map->hint;
 1681         UVMMAP_EVCNT_INCR(mlk_call);
 1682         if (cur != &map->header &&
 1683             address >= cur->start && cur->end > address) {
 1684                 UVMMAP_EVCNT_INCR(mlk_hint);
 1685                 *entry = cur;
 1686                 UVMHIST_LOG(maphist,"<- got it via hint (%#jx)",
 1687                     (uintptr_t)cur, 0, 0, 0);
 1688                 uvm_mapent_check(*entry);
 1689                 return (true);
 1690         }
 1691         uvm_map_check(map, __func__);
 1692 
 1693         /*
 1694          * lookup in the tree.
 1695          */
 1696 
 1697         UVMMAP_EVCNT_INCR(mlk_tree);
 1698         if (__predict_true(uvm_map_lookup_entry_bytree(map, address, entry))) {
 1699                 SAVE_HINT(map, map->hint, *entry);
 1700                 UVMHIST_LOG(maphist,"<- search got it (%#jx)",
 1701                     (uintptr_t)cur, 0, 0, 0);
 1702                 KDASSERT((*entry)->start <= address);
 1703                 KDASSERT(address < (*entry)->end);
 1704                 uvm_mapent_check(*entry);
 1705                 return (true);
 1706         }
 1707 
 1708         SAVE_HINT(map, map->hint, *entry);
 1709         UVMHIST_LOG(maphist,"<- failed!",0,0,0,0);
 1710         KDASSERT((*entry) == &map->header || (*entry)->end <= address);
 1711         KDASSERT((*entry)->next == &map->header ||
 1712             address < (*entry)->next->start);
 1713         return (false);
 1714 }
 1715 
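      /*
       * Illustrative sketch (not compiled, not part of uvm_map.c): how a
       * caller might use uvm_map_lookup_entry().  The map must be at least
       * read-locked; the helper name below is hypothetical.
       */
      #if 0
      static bool
      example_addr_is_mapped(struct vm_map *map, vaddr_t va)
      {
              struct vm_map_entry *entry;
              bool mapped;

              vm_map_lock_read(map);          /* at least a read lock is required */
              mapped = uvm_map_lookup_entry(map, va, &entry);
              /*
               * if mapped, entry->start <= va < entry->end; otherwise entry
               * is the entry just before va (or &map->header).
               */
              vm_map_unlock_read(map);
              return mapped;
      }
      #endif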
 1716 /*
 1717  * See if the range between start and start + length fits in the gap
 1718  * between entry->end and entry->next->start.  Returns 1 if it fits,
 1719  * 0 if it doesn't fit, and -1 if the address wraps around.
 1720  */
 1721 static int
 1722 uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset,
 1723     vsize_t align, int flags, int topdown, struct vm_map_entry *entry)
 1724 {
 1725         vaddr_t end;
 1726 
 1727 #ifdef PMAP_PREFER
 1728         /*
 1729          * push start address forward as needed to avoid VAC alias problems.
 1730          * we only do this if a valid offset is specified.
 1731          */
 1732 
 1733         if (uoffset != UVM_UNKNOWN_OFFSET)
 1734                 PMAP_PREFER(uoffset, start, length, topdown);
 1735 #endif
 1736         if ((flags & UVM_FLAG_COLORMATCH) != 0) {
 1737                 KASSERT(align < uvmexp.ncolors);
 1738                 if (uvmexp.ncolors > 1) {
 1739                         const u_int colormask = uvmexp.colormask;
 1740                         const u_int colorsize = colormask + 1;
 1741                         vaddr_t hint = atop(*start);
 1742                         const u_int color = hint & colormask;
 1743                         if (color != align) {
 1744                                 hint -= color;  /* adjust to color boundary */
 1745                                 KASSERT((hint & colormask) == 0);
 1746                                 if (topdown) {
 1747                                         if (align > color)
 1748                                                 hint -= colorsize;
 1749                                 } else {
 1750                                         if (align < color)
 1751                                                 hint += colorsize;
 1752                                 }
 1753                                 *start = ptoa(hint + align); /* adjust to color */
 1754                         }
 1755                 }
 1756         } else {
 1757                 KASSERT(powerof2(align));
 1758                 uvm_map_align_va(start, align, topdown);
 1759                 /*
 1760                  * XXX Should we PMAP_PREFER() here again?
 1761                  * eh...i think we're okay
 1762                  */
 1763         }
 1764 
 1765         /*
 1766          * Find the end of the proposed new region.  Be sure we didn't
 1767          * wrap around the address; if so, we lose.  Otherwise, if the
 1768          * proposed new region fits before the next entry, we win.
 1769          */
 1770 
 1771         end = *start + length;
 1772         if (end < *start)
 1773                 return (-1);
 1774 
 1775         if (entry->next->start >= end && *start >= entry->end)
 1776                 return (1);
 1777 
 1778         return (0);
 1779 }
 1780 
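      /*
       * Illustrative sketch (not compiled, not part of uvm_map.c): the fit
       * test above reduces to an overflow-safe range check against the gap
       * [entry->end, entry->next->start).  The standalone helper name and
       * parameters are hypothetical.
       */
      #if 0
      static int
      example_fits_in_gap(vaddr_t start, vsize_t length,
          vaddr_t gap_start, vaddr_t gap_end)
      {
              vaddr_t end = start + length;

              if (end < start)
                      return -1;      /* wrapped around the address space */
              if (gap_end >= end && start >= gap_start)
                      return 1;       /* [start, end) lies inside the gap */
              return 0;               /* does not fit in this gap */
      }
      #endif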
 1781 static void
 1782 uvm_findspace_invariants(struct vm_map *map, vaddr_t orig_hint, vaddr_t length,
 1783     struct uvm_object *uobj, voff_t uoffset, vsize_t align, int flags,
 1784     vaddr_t hint, struct vm_map_entry *entry, int line)
 1785 {
 1786         const int topdown = map->flags & VM_MAP_TOPDOWN;
 1787 
 1788         KASSERTMSG( topdown || hint >= orig_hint,
 1789             "map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
 1790             " length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
 1791             " flags=%#x entry=%p (uvm_map_findspace line %d)",
 1792             map, hint, orig_hint,
 1793             length, uobj, (unsigned long long)uoffset, align,
 1794             flags, entry, line);
 1795         KASSERTMSG(!topdown || hint <= orig_hint,
 1796             "map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
 1797             " length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
 1798             " flags=%#x entry=%p (uvm_map_findspace line %d)",
 1799             map, hint, orig_hint,
 1800             length, uobj, (unsigned long long)uoffset, align,
 1801             flags, entry, line);
 1802 }
 1803 
 1804 /*
 1805  * uvm_map_findspace: find "length" sized space in "map".
 1806  *
 1807  * => "hint" is a hint about where we want it, unless UVM_FLAG_FIXED is
 1808  *      set in "flags" (in which case we insist on using "hint").
 1809  * => "result" is VA returned
 1810  * => uobj/uoffset are to be used to handle VAC alignment, if required
 1811  * => if "align" is non-zero, we attempt to align to that value.
 1812  * => caller must at least have read-locked map
 1813  * => returns NULL on failure, or pointer to prev. map entry on success
 1814  * => note this is a cross between the old vm_map_findspace and vm_map_find
 1815  */
 1816 
 1817 struct vm_map_entry *
 1818 uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length,
 1819     vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset,
 1820     vsize_t align, int flags)
 1821 {
 1822 #define INVARIANTS()                                                          \
 1823         uvm_findspace_invariants(map, orig_hint, length, uobj, uoffset, align,\
 1824             flags, hint, entry, __LINE__)
 1825         struct vm_map_entry *entry = NULL;
 1826         struct vm_map_entry *child, *prev, *tmp;
 1827         vaddr_t orig_hint __diagused;
 1828         const int topdown = map->flags & VM_MAP_TOPDOWN;
 1829         int avail;
 1830         UVMHIST_FUNC(__func__);
 1831         UVMHIST_CALLARGS(maphist, "(map=%#jx, hint=%#jx, len=%ju, flags=%#jx...",
 1832             (uintptr_t)map, hint, length, flags);
 1833         UVMHIST_LOG(maphist, " uobj=%#jx, uoffset=%#jx, align=%#jx)",
 1834             (uintptr_t)uobj, uoffset, align, 0);
 1835 
 1836         KASSERT((flags & UVM_FLAG_COLORMATCH) != 0 || powerof2(align));
 1837         KASSERT((flags & UVM_FLAG_COLORMATCH) == 0 || align < uvmexp.ncolors);
 1838         KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
 1839 
 1840         uvm_map_check(map, "map_findspace entry");
 1841 
 1842         /*
 1843          * Clamp the hint to the VM map's min/max address, and remember
 1844          * the clamped original hint -- that is, the original hint,
 1845          * clamped to the min/max address.  If we are aligning, then we
 1846          * may have to try again with no alignment constraint if we
 1847          * fail the first time.
 1848          *
 1849          * We use the original hint to verify later that the search has
 1850          * been monotonic -- that is, nonincreasing or nondecreasing,
 1851          * according to topdown or !topdown respectively.  But the
 1852          * clamping is not monotonic.
 1853          */
 1854         if (hint < vm_map_min(map)) {   /* check ranges ... */
 1855                 if (flags & UVM_FLAG_FIXED) {
 1856                         UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0);
 1857                         return (NULL);
 1858                 }
 1859                 hint = vm_map_min(map);
 1860         }
 1861         if (hint > vm_map_max(map)) {
 1862                 UVMHIST_LOG(maphist,"<- VA %#jx > range [%#jx->%#jx]",
 1863                     hint, vm_map_min(map), vm_map_max(map), 0);
 1864                 return (NULL);
 1865         }
 1866         orig_hint = hint;
 1867         INVARIANTS();
 1868 
 1869         UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
 1870             hint, vm_map_min(map), vm_map_max(map), 0);
 1871 
 1872         /*
 1873          * hint may not be aligned properly; we need to round it up or
 1874          * down before proceeding further.
 1875          */
 1876         if ((flags & UVM_FLAG_COLORMATCH) == 0) {
 1877                 uvm_map_align_va(&hint, align, topdown);
 1878                 INVARIANTS();
 1879         }
 1880 
 1881         UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
 1882             hint, vm_map_min(map), vm_map_max(map), 0);
 1883         /*
 1884          * Look for the first possible address; if there's already
 1885          * something at this address, we have to start after it.
 1886          */
 1887 
 1888         /*
 1889          * @@@: there are four, no, eight cases to consider.
 1890          *
 1891          * 0: found,     fixed,     bottom up -> fail
 1892          * 1: found,     fixed,     top down  -> fail
 1893          * 2: found,     not fixed, bottom up -> start after entry->end,
 1894          *                                       loop up
 1895          * 3: found,     not fixed, top down  -> start before entry->start,
 1896          *                                       loop down
 1897          * 4: not found, fixed,     bottom up -> check entry->next->start, fail
 1898          * 5: not found, fixed,     top down  -> check entry->next->start, fail
 1899          * 6: not found, not fixed, bottom up -> check entry->next->start,
 1900          *                                       loop up
 1901          * 7: not found, not fixed, top down  -> check entry->next->start,
 1902          *                                       loop down
 1903          *
 1904          * as you can see, it reduces to roughly five cases; adding
 1905          * top down mapping only adds one unique case (without it,
 1906          * there would be four cases).
 1907          */
 1908 
 1909         if ((flags & UVM_FLAG_FIXED) == 0 &&
 1910             hint == (topdown ? vm_map_max(map) : vm_map_min(map))) {
 1911                 /*
 1912                  * The uvm_map_findspace algorithm is monotonic -- for
 1913                  * topdown VM it starts with a high hint and returns a
 1914                  * lower free address; for !topdown VM it starts with a
 1915                  * low hint and returns a higher free address.  As an
 1916                  * optimization, start with the first (highest for
 1917                  * topdown, lowest for !topdown) free address.
 1918                  *
 1919                  * XXX This `optimization' probably doesn't actually do
 1920                  * much in practice unless userland explicitly passes
 1921                  * the VM map's minimum or maximum address, which
 1922                  * varies from machine to machine (VM_MAX/MIN_ADDRESS,
 1923                  * e.g. 0x7fbfdfeff000 on amd64 but 0xfffffffff000 on
 1924                  * aarch64) and may vary according to other factors
 1925                  * like sysctl vm.user_va0_disable.  In particular, if
 1926                  * the user specifies 0 as a hint to mmap, then mmap
 1927                  * will choose a default address which is usually _not_
 1928                  * VM_MAX/MIN_ADDRESS but something else instead like
 1929                  * VM_MAX_ADDRESS - stack size - guard page overhead,
 1930                  * in which case this branch is never hit.
 1931                  *
 1932                  * In fact, this branch appears to have been broken for
 1933                  * two decades between when topdown was introduced in
 1934                  * ~2003 and when it was adapted to handle the topdown
 1935                  * case without violating the monotonicity assertion in
 1936                  * 2022.  Maybe Someone^TM should either ditch the
 1937                  * optimization or find a better way to do it.
 1938                  */
 1939                 entry = map->first_free;
 1940         } else {
 1941                 if (uvm_map_lookup_entry(map, hint, &entry)) {
 1942                         /* "hint" address already in use ... */
 1943                         if (flags & UVM_FLAG_FIXED) {
 1944                                 UVMHIST_LOG(maphist, "<- fixed & VA in use",
 1945                                     0, 0, 0, 0);
 1946                                 return (NULL);
 1947                         }
 1948                         if (topdown)
 1949                                 /* Start from lower gap. */
 1950                                 entry = entry->prev;
 1951                 } else if (flags & UVM_FLAG_FIXED) {
 1952                         if (entry->next->start >= hint + length &&
 1953                             hint + length > hint)
 1954                                 goto found;
 1955 
 1956                         /* "hint" address is gap but too small */
 1957                         UVMHIST_LOG(maphist, "<- fixed mapping failed",
 1958                             0, 0, 0, 0);
 1959                         return (NULL); /* only one shot at it ... */
 1960                 } else {
 1961                         /*
 1962                          * See if given hint fits in this gap.
 1963                          */
 1964                         avail = uvm_map_space_avail(&hint, length,
 1965                             uoffset, align, flags, topdown, entry);
 1966                         INVARIANTS();
 1967                         switch (avail) {
 1968                         case 1:
 1969                                 goto found;
 1970                         case -1:
 1971                                 goto wraparound;
 1972                         }
 1973 
 1974                         if (topdown) {
 1975                                 /*
 1976                                  * Still there is a chance to fit
 1977                                  * if hint > entry->end.
 1978                                  */
 1979                         } else {
 1980                                 /* Start from higher gap. */
 1981                                 entry = entry->next;
 1982                                 if (entry == &map->header)
 1983                                         goto notfound;
 1984                                 goto nextgap;
 1985                         }
 1986                 }
 1987         }
 1988 
 1989         /*
 1990          * Note that the UVM_FLAG_FIXED case is already handled above.
 1991          */
 1992         KDASSERT((flags & UVM_FLAG_FIXED) == 0);
 1993 
 1994         /* Try to find the space in the red-black tree */
 1995 
 1996         /* Check slot before any entry */
 1997         if (topdown) {
 1998                 KASSERTMSG(entry->next->start >= vm_map_min(map),
 1999                     "map=%p entry=%p entry->next=%p"
 2000                     " entry->next->start=0x%"PRIxVADDR" min=0x%"PRIxVADDR,
 2001                     map, entry, entry->next,
 2002                     entry->next->start, vm_map_min(map));
 2003                 if (length > entry->next->start - vm_map_min(map))
 2004                         hint = vm_map_min(map); /* XXX goto wraparound? */
 2005                 else
 2006                         hint = entry->next->start - length;
 2007                 KASSERT(hint >= vm_map_min(map));
 2008         } else {
 2009                 hint = entry->end;
 2010         }
 2011         INVARIANTS();
 2012         avail = uvm_map_space_avail(&hint, length, uoffset, align, flags,
 2013             topdown, entry);
 2014         INVARIANTS();
 2015         switch (avail) {
 2016         case 1:
 2017                 goto found;
 2018         case -1:
 2019                 goto wraparound;
 2020         }
 2021 
 2022 nextgap:
 2023         KDASSERT((flags & UVM_FLAG_FIXED) == 0);
 2024         /* If there is not enough space in the whole tree, we fail */
 2025         tmp = ROOT_ENTRY(map);
 2026         if (tmp == NULL || tmp->maxgap < length)
 2027                 goto notfound;
 2028 
 2029         prev = NULL; /* previous candidate */
 2030 
 2031         /* Find an entry close to hint that has enough space */
 2032         for (; tmp;) {
 2033                 KASSERT(tmp->next->start == tmp->end + tmp->gap);
 2034                 if (topdown) {
 2035                         if (tmp->next->start < hint + length &&
 2036                             (prev == NULL || tmp->end > prev->end)) {
 2037                                 if (tmp->gap >= length)
 2038                                         prev = tmp;
 2039                                 else if ((child = LEFT_ENTRY(tmp)) != NULL
 2040                                     && child->maxgap >= length)
 2041                                         prev = tmp;
 2042                         }
 2043                 } else {
 2044                         if (tmp->end >= hint &&
 2045                             (prev == NULL || tmp->end < prev->end)) {
 2046                                 if (tmp->gap >= length)
 2047                                         prev = tmp;
 2048                                 else if ((child = RIGHT_ENTRY(tmp)) != NULL
 2049                                     && child->maxgap >= length)
 2050                                         prev = tmp;
 2051                         }
 2052                 }
 2053                 if (tmp->next->start < hint + length)
 2054                         child = RIGHT_ENTRY(tmp);
 2055                 else if (tmp->end > hint)
 2056                         child = LEFT_ENTRY(tmp);
 2057                 else {
 2058                         if (tmp->gap >= length)
 2059                                 break;
 2060                         if (topdown)
 2061                                 child = LEFT_ENTRY(tmp);
 2062                         else
 2063                                 child = RIGHT_ENTRY(tmp);
 2064                 }
 2065                 if (child == NULL || child->maxgap < length)
 2066                         break;
 2067                 tmp = child;
 2068         }
 2069 
 2070         if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) {
 2071                 /*
 2072                  * Check if the entry that we found satisfies the
 2073                  * space requirement.
 2074                  */
 2075                 if (topdown) {
 2076                         if (hint > tmp->next->start - length)
 2077                                 hint = tmp->next->start - length;
 2078                 } else {
 2079                         if (hint < tmp->end)
 2080                                 hint = tmp->end;
 2081                 }
 2082                 INVARIANTS();
 2083                 avail = uvm_map_space_avail(&hint, length, uoffset, align,
 2084                     flags, topdown, tmp);
 2085                 INVARIANTS();
 2086                 switch (avail) {
 2087                 case 1:
 2088                         entry = tmp;
 2089                         goto found;
 2090                 case -1:
 2091                         goto wraparound;
 2092                 }
 2093                 if (tmp->gap >= length)
 2094                         goto listsearch;
 2095         }
 2096         if (prev == NULL)
 2097                 goto notfound;
 2098 
 2099         if (topdown) {
 2100                 KASSERT(orig_hint >= prev->next->start - length ||
 2101                     prev->next->start - length > prev->next->start);
 2102                 hint = prev->next->start - length;
 2103         } else {
 2104                 KASSERT(orig_hint <= prev->end);
 2105                 hint = prev->end;
 2106         }
 2107         INVARIANTS();
 2108         avail = uvm_map_space_avail(&hint, length, uoffset, align,
 2109             flags, topdown, prev);
 2110         INVARIANTS();
 2111         switch (avail) {
 2112         case 1:
 2113                 entry = prev;
 2114                 goto found;
 2115         case -1:
 2116                 goto wraparound;
 2117         }
 2118         if (prev->gap >= length)
 2119                 goto listsearch;
 2120 
 2121         if (topdown)
 2122                 tmp = LEFT_ENTRY(prev);
 2123         else
 2124                 tmp = RIGHT_ENTRY(prev);
 2125         for (;;) {
 2126                 KASSERT(tmp && tmp->maxgap >= length);
 2127                 if (topdown)
 2128                         child = RIGHT_ENTRY(tmp);
 2129                 else
 2130                         child = LEFT_ENTRY(tmp);
 2131                 if (child && child->maxgap >= length) {
 2132                         tmp = child;
 2133                         continue;
 2134                 }
 2135                 if (tmp->gap >= length)
 2136                         break;
 2137                 if (topdown)
 2138                         tmp = LEFT_ENTRY(tmp);
 2139                 else
 2140                         tmp = RIGHT_ENTRY(tmp);
 2141         }
 2142 
 2143         if (topdown) {
 2144                 KASSERT(orig_hint >= tmp->next->start - length ||
 2145                     tmp->next->start - length > tmp->next->start);
 2146                 hint = tmp->next->start - length;
 2147         } else {
 2148                 KASSERT(orig_hint <= tmp->end);
 2149                 hint = tmp->end;
 2150         }
 2151         INVARIANTS();
 2152         avail = uvm_map_space_avail(&hint, length, uoffset, align,
 2153             flags, topdown, tmp);
 2154         INVARIANTS();
 2155         switch (avail) {
 2156         case 1:
 2157                 entry = tmp;
 2158                 goto found;
 2159         case -1:
 2160                 goto wraparound;
 2161         }
 2162 
 2163         /*
 2164          * The tree fails to find an entry because of offset or alignment
 2165          * restrictions.  Search the list instead.
 2166          */
 2167  listsearch:
 2168         /*
 2169          * Look through the rest of the map, trying to fit a new region in
 2170          * the gap between existing regions, or after the very last region.
 2171          * note: entry->end = base VA of current gap,
 2172          *       entry->next->start = VA of end of current gap
 2173          */
 2174 
 2175         INVARIANTS();
 2176         for (;;) {
 2177                 /* Update hint for current gap. */
 2178                 hint = topdown ? entry->next->start - length : entry->end;
 2179                 INVARIANTS();
 2180 
 2181                 /* See if it fits. */
 2182                 avail = uvm_map_space_avail(&hint, length, uoffset, align,
 2183                     flags, topdown, entry);
 2184                 INVARIANTS();
 2185                 switch (avail) {
 2186                 case 1:
 2187                         goto found;
 2188                 case -1:
 2189                         goto wraparound;
 2190                 }
 2191 
 2192                 /* Advance to next/previous gap */
 2193                 if (topdown) {
 2194                         if (entry == &map->header) {
 2195                                 UVMHIST_LOG(maphist, "<- failed (off start)",
 2196                                     0,0,0,0);
 2197                                 goto notfound;
 2198                         }
 2199                         entry = entry->prev;
 2200                 } else {
 2201                         entry = entry->next;
 2202                         if (entry == &map->header) {
 2203                                 UVMHIST_LOG(maphist, "<- failed (off end)",
 2204                                     0,0,0,0);
 2205                                 goto notfound;
 2206                         }
 2207                 }
 2208         }
 2209 
 2210  found:
 2211         SAVE_HINT(map, map->hint, entry);
 2212         *result = hint;
 2213         UVMHIST_LOG(maphist,"<- got it!  (result=%#jx)", hint, 0,0,0);
 2214         INVARIANTS();
 2215         KASSERT(entry->end <= hint);
 2216         KASSERT(hint + length <= entry->next->start);
 2217         return (entry);
 2218 
 2219  wraparound:
 2220         UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0);
 2221 
 2222         return (NULL);
 2223 
 2224  notfound:
 2225         UVMHIST_LOG(maphist, "<- failed (notfound)", 0,0,0,0);
 2226 
 2227         return (NULL);
 2228 #undef INVARIANTS
 2229 }
 2230 
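      /*
       * Illustrative sketch (not compiled, not part of uvm_map.c): probing
       * for "len" bytes of free virtual space with uvm_map_findspace() under
       * the map lock.  A real allocation would insert an entry before
       * unlocking, as uvm_map() does; the helper name here is hypothetical
       * and the returned address is stale once the lock is dropped.
       */
      #if 0
      static int
      example_probe_space(struct vm_map *map, vsize_t len, vaddr_t *vap)
      {
              struct vm_map_entry *prev;
              vaddr_t va;

              vm_map_lock(map);
              prev = uvm_map_findspace(map, vm_map_min(map), len, &va,
                  NULL /* uobj */, UVM_UNKNOWN_OFFSET, 0 /* align */, 0);
              vm_map_unlock(map);
              if (prev == NULL)
                      return ENOMEM;
              *vap = va;
              return 0;
      }
      #endif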
 2231 /*
 2232  *   U N M A P   -   m a i n   h e l p e r   f u n c t i o n s
 2233  */
 2234 
 2235 /*
 2236  * uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop")
 2237  *
 2238  * => caller must check alignment and size
 2239  * => map must be locked by caller
 2240  * => we return a list of map entries that we've removed from the map
 2241  *    in "entry_list"
 2242  */
 2243 
 2244 void
 2245 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
 2246     struct vm_map_entry **entry_list /* OUT */, int flags)
 2247 {
 2248         struct vm_map_entry *entry, *first_entry, *next;
 2249         vaddr_t len;
 2250         UVMHIST_FUNC(__func__);
 2251         UVMHIST_CALLARGS(maphist,"(map=%#jx, start=%#jx, end=%#jx)",
 2252             (uintptr_t)map, start, end, 0);
 2253         VM_MAP_RANGE_CHECK(map, start, end);
 2254 
 2255         uvm_map_check(map, "unmap_remove entry");
 2256 
 2257         /*
 2258          * find first entry
 2259          */
 2260 
 2261         if (uvm_map_lookup_entry(map, start, &first_entry) == true) {
 2262                 /* clip and go... */
 2263                 entry = first_entry;
 2264                 UVM_MAP_CLIP_START(map, entry, start);
 2265                 /* critical!  prevents stale hint */
 2266                 SAVE_HINT(map, entry, entry->prev);
 2267         } else {
 2268                 entry = first_entry->next;
 2269         }
 2270 
 2271         /*
 2272          * save the free space hint
 2273          */
 2274 
 2275         if (map->first_free != &map->header && map->first_free->start >= start)
 2276                 map->first_free = entry->prev;
 2277 
 2278         /*
 2279          * note: we now re-use first_entry for a different task.  we remove
 2280          * a number of map entries from the map and save them in a linked
 2281          * list headed by "first_entry".  once we remove them from the map
 2282          * the caller should unlock the map and drop the references to the
 2283          * backing objects [c.f. uvm_unmap_detach].  the object is to
 2284          * separate unmapping from reference dropping.  why?
 2285          *   [1] the map has to be locked for unmapping
 2286          *   [2] the map need not be locked for reference dropping
 2287          *   [3] dropping references may trigger pager I/O, and if we hit
 2288          *       a pager that does synchronous I/O we may have to wait for it.
 2289          *   [4] we would like all waiting for I/O to occur with maps unlocked
 2290          *       so that we don't block other threads.
 2291          */
 2292 
 2293         first_entry = NULL;
 2294         *entry_list = NULL;
 2295 
 2296         /*
 2297          * break up the area into map entry sized regions and unmap.  note
 2298          * that all mappings have to be removed before we can even consider
 2299          * dropping references to amaps or VM objects (otherwise we could end
 2300          * up with a mapping to a page on the free list which would be very bad)
 2301          */
 2302 
 2303         while ((entry != &map->header) && (entry->start < end)) {
 2304                 KASSERT((entry->flags & UVM_MAP_STATIC) == 0);
 2305 
 2306                 UVM_MAP_CLIP_END(map, entry, end);
 2307                 next = entry->next;
 2308                 len = entry->end - entry->start;
 2309 
 2310                 /*
 2311                  * unwire before removing addresses from the pmap; otherwise
 2312                  * unwiring will put the entries back into the pmap (XXX).
 2313                  */
 2314 
 2315                 if (VM_MAPENT_ISWIRED(entry)) {
 2316                         uvm_map_entry_unwire(map, entry);
 2317                 }
 2318                 if (flags & UVM_FLAG_VAONLY) {
 2319 
 2320                         /* nothing */
 2321 
 2322                 } else if ((map->flags & VM_MAP_PAGEABLE) == 0) {
 2323 
 2324                         /*
 2325                          * if the map is non-pageable, any pages mapped there
 2326                          * must be wired and entered with pmap_kenter_pa(),
 2327                          * and we should free any such pages immediately.
 2328                          * this is mostly used for kmem_map.
 2329                          */
 2330                         KASSERT(vm_map_pmap(map) == pmap_kernel());
 2331 
 2332                         uvm_km_pgremove_intrsafe(map, entry->start, entry->end);
 2333                 } else if (UVM_ET_ISOBJ(entry) &&
 2334                            UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
 2335                         panic("%s: kernel object %p %p\n",
 2336                             __func__, map, entry);
 2337                 } else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) {
 2338                         /*
 2339                          * remove mappings the standard way.  lock object
 2340                          * and/or amap to ensure vm_page state does not
 2341                          * change while in pmap_remove().
 2342                          */
 2343 
 2344 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
 2345                         uvm_map_lock_entry(entry, RW_WRITER);
 2346 #else
 2347                         uvm_map_lock_entry(entry, RW_READER);
 2348 #endif
 2349                         pmap_remove(map->pmap, entry->start, entry->end);
 2350 
 2351                         /*
 2352                          * note: if map is dying, leave pmap_update() for
 2353                          * later.  if the map is to be reused (exec) then
 2354                          * pmap_update() will be called.  if the map is
 2355                          * being disposed of (exit) then pmap_destroy()
 2356                          * will be called.
 2357                          */
 2358 
 2359                         if ((map->flags & VM_MAP_DYING) == 0) {
 2360                                 pmap_update(vm_map_pmap(map));
 2361                         } else {
 2362                                 KASSERT(vm_map_pmap(map) != pmap_kernel());
 2363                         }
 2364 
 2365                         uvm_map_unlock_entry(entry);
 2366                 }
 2367 
 2368 #if defined(UVMDEBUG)
 2369                 /*
 2370                  * check if there's a remaining mapping,
 2371                  * which would be a bug in the caller.
 2372                  */
 2373 
 2374                 vaddr_t va;
 2375                 for (va = entry->start; va < entry->end;
 2376                     va += PAGE_SIZE) {
 2377                         if (pmap_extract(vm_map_pmap(map), va, NULL)) {
 2378                                 panic("%s: %#"PRIxVADDR" has mapping",
 2379                                     __func__, va);
 2380                         }
 2381                 }
 2382 
 2383                 if (VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
 2384                         uvm_km_check_empty(map, entry->start,
 2385                             entry->end);
 2386                 }
 2387 #endif /* defined(UVMDEBUG) */
 2388 
 2389                 /*
 2390                  * remove entry from map and put it on our list of entries
 2391                  * that we've nuked.  then go to next entry.
 2392                  */
 2393 
 2394                 UVMHIST_LOG(maphist, "  removed map entry %#jx",
 2395                     (uintptr_t)entry, 0, 0, 0);
 2396 
 2397                 /* critical!  prevents stale hint */
 2398                 SAVE_HINT(map, entry, entry->prev);
 2399 
 2400                 uvm_map_entry_unlink(map, entry);
 2401                 KASSERT(map->size >= len);
 2402                 map->size -= len;
 2403                 entry->prev = NULL;
 2404                 entry->next = first_entry;
 2405                 first_entry = entry;
 2406                 entry = next;
 2407         }
 2408 
 2409         uvm_map_check(map, "unmap_remove leave");
 2410 
 2411         /*
 2412          * now we've cleaned up the map and are ready for the caller to drop
 2413          * references to the mapped objects.
 2414          */
 2415 
 2416         *entry_list = first_entry;
 2417         UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
 2418 
 2419         if (map->flags & VM_MAP_WANTVA) {
 2420                 mutex_enter(&map->misc_lock);
 2421                 map->flags &= ~VM_MAP_WANTVA;
 2422                 cv_broadcast(&map->cv);
 2423                 mutex_exit(&map->misc_lock);
 2424         }
 2425 }
 2426 
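      /*
       * Illustrative sketch (not compiled, not part of uvm_map.c): the
       * two-phase pattern described in the comments above -- remove the
       * entries with the map write-locked, then drop the amap/object
       * references with the map unlocked, since that may sleep on pager
       * I/O.  The helper name here is hypothetical.
       */
      #if 0
      static void
      example_unmap_range(struct vm_map *map, vaddr_t start, vaddr_t end)
      {
              struct vm_map_entry *dead_entries;

              vm_map_lock(map);
              uvm_unmap_remove(map, start, end, &dead_entries, 0);
              vm_map_unlock(map);

              /* reference dropping is done unlocked; see uvm_unmap_detach() */
              if (dead_entries != NULL)
                      uvm_unmap_detach(dead_entries, 0);
      }
      #endif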
 2427 /*
 2428  * uvm_unmap_detach: drop references in a chain of map entries
 2429  *
 2430  * => we will free the map entries as we traverse the list.
 2431  */
 2432 
 2433 void
 2434 uvm_unmap_detach(struct vm_map_entry *first_entry, int flags)
 2435 {
 2436         struct vm_map_entry *next_entry;
 2437         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 2438 
 2439         while (first_entry) {
 2440                 KASSERT(!VM_MAPENT_ISWIRED(first_entry));
 2441                 UVMHIST_LOG(maphist,
 2442                     "  detach %#jx: amap=%#jx, obj=%#jx, submap?=%jd",
 2443                     (uintptr_t)first_entry,
 2444                     (uintptr_t)first_entry->aref.ar_amap,
 2445                     (uintptr_t)first_entry->object.uvm_obj,
 2446                     UVM_ET_ISSUBMAP(first_entry));
 2447 
 2448                 /*
 2449                  * drop reference to amap, if we've got one
 2450                  */
 2451 
 2452                 if (first_entry->aref.ar_amap)
 2453                         uvm_map_unreference_amap(first_entry, flags);
 2454 
 2455                 /*
 2456                  * drop reference to our backing object, if we've got one
 2457                  */
 2458 
 2459                 KASSERT(!UVM_ET_ISSUBMAP(first_entry));
 2460                 if (UVM_ET_ISOBJ(first_entry) &&
 2461                     first_entry->object.uvm_obj->pgops->pgo_detach) {
 2462                         (*first_entry->object.uvm_obj->pgops->pgo_detach)
 2463                                 (first_entry->object.uvm_obj);
 2464                 }
 2465                 next_entry = first_entry->next;
 2466                 uvm_mapent_free(first_entry);
 2467                 first_entry = next_entry;
 2468         }
 2469         UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
 2470 }
 2471 
 2472 /*
 2473  *   E X T R A C T I O N   F U N C T I O N S
 2474  */
 2475 
 2476 /*
 2477  * uvm_map_reserve: reserve space in a vm_map for future use.
 2478  *
 2479  * => we reserve space in a map by putting a dummy map entry in the
 2480  *    map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE)
 2481  * => map should be unlocked (we will write lock it)
 2482  * => we return true if we were able to reserve space
 2483  * => XXXCDC: should be inline?
 2484  */
 2485 
 2486 int
 2487 uvm_map_reserve(struct vm_map *map, vsize_t size,
 2488     vaddr_t offset      /* hint for pmap_prefer */,
 2489     vsize_t align       /* alignment */,
 2490     vaddr_t *raddr      /* IN:hint, OUT: reserved VA */,
 2491     uvm_flag_t flags    /* UVM_FLAG_FIXED or UVM_FLAG_COLORMATCH or 0 */)
 2492 {
 2493         UVMHIST_FUNC(__func__);
 2494         UVMHIST_CALLARGS(maphist, "(map=%#jx, size=%#jx, offset=%#jx, addr=%#jx)",
 2495             (uintptr_t)map, size, offset, (uintptr_t)raddr);
 2496 
 2497         size = round_page(size);
 2498 
 2499         /*
 2500          * reserve some virtual space.
 2501          */
 2502 
 2503         if (uvm_map(map, raddr, size, NULL, offset, align,
 2504             UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
 2505             UVM_ADV_RANDOM, UVM_FLAG_NOMERGE|flags)) != 0) {
 2506                 UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0);
 2507                 return (false);
 2508         }
 2509 
 2510         UVMHIST_LOG(maphist, "<- done (*raddr=%#jx)", *raddr,0,0,0);
 2511         return (true);
 2512 }
 2513 
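      /*
       * Illustrative sketch (not compiled, not part of uvm_map.c): reserving
       * a page-rounded chunk of virtual space with a blank entry, as
       * described above.  The helper name is hypothetical; the reserved
       * range has no object, no amap and VM_PROT_NONE until it is replaced
       * with real mappings.
       */
      #if 0
      static int
      example_reserve_va(struct vm_map *map, vsize_t size, vaddr_t *vap)
      {
              vaddr_t va = vm_map_min(map);   /* IN: hint, OUT: reserved VA */

              if (!uvm_map_reserve(map, size, 0 /* offset hint */,
                  0 /* align */, &va, 0 /* flags */))
                      return ENOMEM;
              *vap = va;
              return 0;
      }
      #endif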
 2514 /*
 2515  * uvm_map_replace: replace a reserved (blank) area of memory with
 2516  * real mappings.
 2517  *
 2518  * => caller must WRITE-LOCK the map
 2519  * => we return true if replacement was a success
 2520  * => we expect the newents chain to have nnewents entries on it and
 2521  *    we expect newents->prev to point to the last entry on the list
 2522  * => note newents is allowed to be NULL
 2523  */
 2524 
 2525 static int
 2526 uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end,
 2527     struct vm_map_entry *newents, int nnewents, vsize_t nsize,
 2528     struct vm_map_entry **oldentryp)
 2529 {
 2530         struct vm_map_entry *oldent, *last;
 2531 
 2532         uvm_map_check(map, "map_replace entry");
 2533 
 2534         /*
 2535          * first find the blank map entry at the specified address
 2536          */
 2537 
 2538         if (!uvm_map_lookup_entry(map, start, &oldent)) {
 2539                 return (false);
 2540         }
 2541 
 2542         /*
 2543          * check to make sure we have a proper blank entry
 2544          */
 2545 
 2546         if (end < oldent->end) {
 2547                 UVM_MAP_CLIP_END(map, oldent, end);
 2548         }
 2549         if (oldent->start != start || oldent->end != end ||
 2550             oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) {
 2551                 return (false);
 2552         }
 2553 
 2554 #ifdef DIAGNOSTIC
 2555 
 2556         /*
 2557          * sanity check the newents chain
 2558          */
 2559 
 2560         {
 2561                 struct vm_map_entry *tmpent = newents;
 2562                 int nent = 0;
 2563                 vsize_t sz = 0;
 2564                 vaddr_t cur = start;
 2565 
 2566                 while (tmpent) {
 2567                         nent++;
 2568                         sz += tmpent->end - tmpent->start;
 2569                         if (tmpent->start < cur)
 2570                                 panic("uvm_map_replace1");
 2571                         if (tmpent->start >= tmpent->end || tmpent->end > end) {
 2572                                 panic("uvm_map_replace2: "
 2573                                     "tmpent->start=%#"PRIxVADDR
 2574                                     ", tmpent->end=%#"PRIxVADDR
 2575                                     ", end=%#"PRIxVADDR,
 2576                                     tmpent->start, tmpent->end, end);
 2577                         }
 2578                         cur = tmpent->end;
 2579                         if (tmpent->next) {
 2580                                 if (tmpent->next->prev != tmpent)
 2581                                         panic("uvm_map_replace3");
 2582                         } else {
 2583                                 if (newents->prev != tmpent)
 2584                                         panic("uvm_map_replace4");
 2585                         }
 2586                         tmpent = tmpent->next;
 2587                 }
 2588                 if (nent != nnewents)
 2589                         panic("uvm_map_replace5");
 2590                 if (sz != nsize)
 2591                         panic("uvm_map_replace6");
 2592         }
 2593 #endif
 2594 
 2595         /*
 2596          * map entry is a valid blank!   replace it.   (this does all the
 2597          * work of map entry link/unlink...).
 2598          */
 2599 
 2600         if (newents) {
 2601                 last = newents->prev;
 2602 
 2603                 /* critical: flush stale hints out of map */
 2604                 SAVE_HINT(map, map->hint, newents);
 2605                 if (map->first_free == oldent)
 2606                         map->first_free = last;
 2607 
 2608                 last->next = oldent->next;
 2609                 last->next->prev = last;
 2610 
 2611                 /* Fix RB tree */
 2612                 uvm_rb_remove(map, oldent);
 2613 
 2614                 newents->prev = oldent->prev;
 2615                 newents->prev->next = newents;
 2616                 map->nentries = map->nentries + (nnewents - 1);
 2617 
 2618                 /* Fixup the RB tree */
 2619                 {
 2620                         int i;
 2621                         struct vm_map_entry *tmp;
 2622 
 2623                         tmp = newents;
 2624                         for (i = 0; i < nnewents && tmp; i++) {
 2625                                 uvm_rb_insert(map, tmp);
 2626                                 tmp = tmp->next;
 2627                         }
 2628                 }
 2629         } else {
 2630                 /* NULL list of new entries: just remove the old one */
 2631                 clear_hints(map, oldent);
 2632                 uvm_map_entry_unlink(map, oldent);
 2633         }
 2634         map->size -= end - start - nsize;
 2635 
 2636         uvm_map_check(map, "map_replace leave");
 2637 
 2638         /*
 2639          * now we can free the old blank entry and return.
 2640          */
 2641 
 2642         *oldentryp = oldent;
 2643         return (true);
 2644 }
 2645 
 2646 /*
 2647  * uvm_map_extract: extract a mapping from a map and put it somewhere
 2648  *      (maybe removing the old mapping)
 2649  *
 2650  * => maps should be unlocked (we will write lock them)
 2651  * => returns 0 on success, error code otherwise
 2652  * => start must be page aligned
 2653  * => len must be page sized
 2654  * => flags:
 2655  *      UVM_EXTRACT_REMOVE: remove mappings from srcmap
 2656  *      UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only)
 2657  *      UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs
 2658  *      UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
 2659  *      UVM_EXTRACT_PROT_ALL: set prot to UVM_PROT_ALL as we go
 2660  *    >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<<
 2661  *    >>>NOTE: QREF's must be unmapped via the QREF path, thus should only
 2662  *             be used from within the kernel in a kernel level map <<<
 2663  */
 2664 
 2665 int
 2666 uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
 2667     struct vm_map *dstmap, vaddr_t *dstaddrp, int flags)
 2668 {
 2669         vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge;
 2670         struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry,
 2671             *deadentry, *oldentry;
 2672         struct vm_map_entry *resentry = NULL; /* a dummy reservation entry */
 2673         vsize_t elen __unused;
 2674         int nchain, error, copy_ok;
 2675         vsize_t nsize;
 2676         UVMHIST_FUNC(__func__);
 2677         UVMHIST_CALLARGS(maphist,"(srcmap=%#jx,start=%#jx, len=%#jx",
 2678             (uintptr_t)srcmap, start, len, 0);
 2679         UVMHIST_LOG(maphist," ...,dstmap=%#jx, flags=%#jx)",
 2680             (uintptr_t)dstmap, flags, 0, 0);
 2681 
 2682         /*
 2683          * step 0: sanity check: start must be on a page boundary, length
 2684          * must be page sized.  can't ask for CONTIG/QREF if you asked for
 2685          * REMOVE.
 2686          */
 2687 
 2688         KASSERT((start & PAGE_MASK) == 0 && (len & PAGE_MASK) == 0);
 2689         KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 ||
 2690                 (flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0);
 2691 
 2692         /*
 2693          * step 1: reserve space in the target map for the extracted area
 2694          */
 2695 
 2696         if ((flags & UVM_EXTRACT_RESERVED) == 0) {
 2697                 dstaddr = vm_map_min(dstmap);
 2698                 if (!uvm_map_reserve(dstmap, len, start,
 2699                     atop(start) & uvmexp.colormask, &dstaddr,
 2700                     UVM_FLAG_COLORMATCH))
 2701                         return (ENOMEM);
 2702                 KASSERT((atop(start ^ dstaddr) & uvmexp.colormask) == 0);
 2703                 *dstaddrp = dstaddr;    /* pass address back to caller */
 2704                 UVMHIST_LOG(maphist, "  dstaddr=%#jx", dstaddr,0,0,0);
 2705         } else {
 2706                 dstaddr = *dstaddrp;
 2707         }
 2708 
 2709         /*
 2710          * step 2: setup for the extraction process loop by init'ing the
 2711          * map entry chain, locking src map, and looking up the first useful
 2712          * entry in the map.
 2713          */
 2714 
 2715         end = start + len;
 2716         newend = dstaddr + len;
 2717         chain = endchain = NULL;
 2718         nchain = 0;
 2719         nsize = 0;
 2720         vm_map_lock(srcmap);
 2721 
 2722         if (uvm_map_lookup_entry(srcmap, start, &entry)) {
 2723 
 2724                 /* "start" is within an entry */
 2725                 if (flags & UVM_EXTRACT_QREF) {
 2726 
 2727                         /*
 2728                          * for quick references we don't clip the entry, so
 2729                          * the entry may map space "before" the starting
 2730                          * virtual address... this is the "fudge" factor
 2731                          * (which can be non-zero only the first time
 2732                          * through the "while" loop in step 3).
 2733                          */
 2734 
 2735                         fudge = start - entry->start;
 2736                 } else {
 2737 
 2738                         /*
 2739                          * normal reference: we clip the map to fit (thus
 2740                          * fudge is zero)
 2741                          */
 2742 
 2743                         UVM_MAP_CLIP_START(srcmap, entry, start);
 2744                         SAVE_HINT(srcmap, srcmap->hint, entry->prev);
 2745                         fudge = 0;
 2746                 }
 2747         } else {
 2748 
 2749                 /* "start" is not within an entry ... skip to next entry */
 2750                 if (flags & UVM_EXTRACT_CONTIG) {
 2751                         error = EINVAL;
 2752                         goto bad;    /* definite hole here ... */
 2753                 }
 2754 
 2755                 entry = entry->next;
 2756                 fudge = 0;
 2757         }
 2758 
 2759         /* save values from srcmap for step 6 */
 2760         orig_entry = entry;
 2761         orig_fudge = fudge;
 2762 
 2763         /*
 2764          * step 3: now start looping through the map entries, extracting
 2765          * as we go.
 2766          */
 2767 
 2768         while (entry->start < end && entry != &srcmap->header) {
 2769 
 2770                 /* if we are not doing a quick reference, clip it */
 2771                 if ((flags & UVM_EXTRACT_QREF) == 0)
 2772                         UVM_MAP_CLIP_END(srcmap, entry, end);
 2773 
 2774                 /* clear needs_copy (allow chunking) */
 2775                 if (UVM_ET_ISNEEDSCOPY(entry)) {
 2776                         amap_copy(srcmap, entry,
 2777                             AMAP_COPY_NOWAIT|AMAP_COPY_NOMERGE, start, end);
 2778                         if (UVM_ET_ISNEEDSCOPY(entry)) {  /* failed? */
 2779                                 error = ENOMEM;
 2780                                 goto bad;
 2781                         }
 2782 
 2783                         /* amap_copy could clip (during chunk)!  update fudge */
 2784                         if (fudge) {
 2785                                 fudge = start - entry->start;
 2786                                 orig_fudge = fudge;
 2787                         }
 2788                 }
 2789 
 2790                 /* calculate the offset of this entry from "start" */
 2791                 oldoffset = (entry->start + fudge) - start;
 2792 
 2793                 /* allocate a new map entry */
 2794                 newentry = uvm_mapent_alloc(dstmap, 0);
 2795                 if (newentry == NULL) {
 2796                         error = ENOMEM;
 2797                         goto bad;
 2798                 }
 2799 
 2800                 /* set up new map entry */
 2801                 newentry->next = NULL;
 2802                 newentry->prev = endchain;
 2803                 newentry->start = dstaddr + oldoffset;
 2804                 newentry->end =
 2805                     newentry->start + (entry->end - (entry->start + fudge));
 2806                 if (newentry->end > newend || newentry->end < newentry->start)
 2807                         newentry->end = newend;
 2808                 newentry->object.uvm_obj = entry->object.uvm_obj;
 2809                 if (newentry->object.uvm_obj) {
 2810                         if (newentry->object.uvm_obj->pgops->pgo_reference)
 2811                                 newentry->object.uvm_obj->pgops->
 2812                                     pgo_reference(newentry->object.uvm_obj);
 2813                         newentry->offset = entry->offset + fudge;
 2814                 } else {
 2815                         newentry->offset = 0;
 2816                 }
 2817                 newentry->etype = entry->etype;
 2818                 if (flags & UVM_EXTRACT_PROT_ALL) {
 2819                         newentry->protection = newentry->max_protection =
 2820                             UVM_PROT_ALL;
 2821                 } else {
 2822                         newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ?
 2823                             entry->max_protection : entry->protection;
 2824                         newentry->max_protection = entry->max_protection;
 2825                 }
 2826                 newentry->inheritance = entry->inheritance;
 2827                 newentry->wired_count = 0;
 2828                 newentry->aref.ar_amap = entry->aref.ar_amap;
 2829                 if (newentry->aref.ar_amap) {
 2830                         newentry->aref.ar_pageoff =
 2831                             entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT);
 2832                         uvm_map_reference_amap(newentry, AMAP_SHARED |
 2833                             ((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0));
 2834                 } else {
 2835                         newentry->aref.ar_pageoff = 0;
 2836                 }
 2837                 newentry->advice = entry->advice;
 2838                 if ((flags & UVM_EXTRACT_QREF) != 0) {
 2839                         newentry->flags |= UVM_MAP_NOMERGE;
 2840                 }
 2841 
 2842                 /* now link it on the chain */
 2843                 nchain++;
 2844                 nsize += newentry->end - newentry->start;
 2845                 if (endchain == NULL) {
 2846                         chain = endchain = newentry;
 2847                 } else {
 2848                         endchain->next = newentry;
 2849                         endchain = newentry;
 2850                 }
 2851 
 2852                 /* end of 'while' loop! */
 2853                 if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end &&
 2854                     (entry->next == &srcmap->header ||
 2855                     entry->next->start != entry->end)) {
 2856                         error = EINVAL;
 2857                         goto bad;
 2858                 }
 2859                 entry = entry->next;
 2860                 fudge = 0;
 2861         }
 2862 
 2863         /*
 2864          * step 4: close off chain (in format expected by uvm_map_replace)
 2865          */
 2866 
 2867         if (chain)
 2868                 chain->prev = endchain;
 2869 
 2870         /*
 2871          * step 5: attempt to lock the dest map so we can pmap_copy.
 2872          * note usage of copy_ok:
 2873          *   1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5)
 2874          *   0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7
 2875          */
 2876 
 2877         if (srcmap == dstmap || vm_map_lock_try(dstmap) == true) {
 2878                 copy_ok = 1;
 2879                 if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
 2880                     nchain, nsize, &resentry)) {
 2881                         if (srcmap != dstmap)
 2882                                 vm_map_unlock(dstmap);
 2883                         error = EIO;
 2884                         goto bad;
 2885                 }
 2886         } else {
 2887                 copy_ok = 0;
 2888                 /* replacement deferred until step 7 */
 2889         }
 2890 
 2891         /*
 2892          * step 6: traverse the srcmap a second time to do the following:
 2893          *  - if we got a lock on the dstmap do pmap_copy
 2894          *  - if UVM_EXTRACT_REMOVE remove the entries
 2895          * we make use of orig_entry and orig_fudge (saved in step 2)
 2896          */
 2897 
 2898         if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) {
 2899 
 2900                 /* purge possible stale hints from srcmap */
 2901                 if (flags & UVM_EXTRACT_REMOVE) {
 2902                         SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev);
 2903                         if (srcmap->first_free != &srcmap->header &&
 2904                             srcmap->first_free->start >= start)
 2905                                 srcmap->first_free = orig_entry->prev;
 2906                 }
 2907 
 2908                 entry = orig_entry;
 2909                 fudge = orig_fudge;
 2910                 deadentry = NULL;       /* for UVM_EXTRACT_REMOVE */
 2911 
 2912                 while (entry->start < end && entry != &srcmap->header) {
 2913                         if (copy_ok) {
 2914                                 oldoffset = (entry->start + fudge) - start;
 2915                                 elen = MIN(end, entry->end) -
 2916                                     (entry->start + fudge);
 2917                                 pmap_copy(dstmap->pmap, srcmap->pmap,
 2918                                     dstaddr + oldoffset, elen,
 2919                                     entry->start + fudge);
 2920                         }
 2921 
 2922                         /* we advance "entry" in the following if statement */
 2923                         if (flags & UVM_EXTRACT_REMOVE) {
 2924 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
 2925                                 uvm_map_lock_entry(entry, RW_WRITER);
 2926 #else
 2927                                 uvm_map_lock_entry(entry, RW_READER);
 2928 #endif
 2929                                 pmap_remove(srcmap->pmap, entry->start,
 2930                                                 entry->end);
 2931                                 uvm_map_unlock_entry(entry);
 2932                                 oldentry = entry;       /* save entry */
 2933                                 entry = entry->next;    /* advance */
 2934                                 uvm_map_entry_unlink(srcmap, oldentry);
 2935                                                         /* add to dead list */
 2936                                 oldentry->next = deadentry;
 2937                                 deadentry = oldentry;
 2938                         } else {
 2939                                 entry = entry->next;            /* advance */
 2940                         }
 2941 
 2942                         /* end of 'while' loop */
 2943                         fudge = 0;
 2944                 }
 2945                 pmap_update(srcmap->pmap);
 2946 
 2947                 /*
 2948                  * unlock dstmap.  we will dispose of deadentry in
 2949                  * step 7 if needed
 2950                  */
 2951 
 2952                 if (copy_ok && srcmap != dstmap)
 2953                         vm_map_unlock(dstmap);
 2954 
 2955         } else {
 2956                 deadentry = NULL;
 2957         }
 2958 
 2959         /*
 2960          * step 7: we are done with the source map, unlock.   if copy_ok
 2961          * is 0 then we have not replaced the dummy mapping in dstmap yet
 2962          * and we need to do so now.
 2963          */
 2964 
 2965         vm_map_unlock(srcmap);
 2966         if ((flags & UVM_EXTRACT_REMOVE) && deadentry)
 2967                 uvm_unmap_detach(deadentry, 0);   /* dispose of old entries */
 2968 
 2969         /* now do the replacement if we didn't do it in step 5 */
 2970         if (copy_ok == 0) {
 2971                 vm_map_lock(dstmap);
 2972                 error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
 2973                     nchain, nsize, &resentry);
 2974                 vm_map_unlock(dstmap);
 2975 
 2976                 if (error == false) {
 2977                         error = EIO;
 2978                         goto bad2;
 2979                 }
 2980         }
 2981 
 2982         if (resentry != NULL)
 2983                 uvm_mapent_free(resentry);
 2984 
 2985         return (0);
 2986 
 2987         /*
 2988          * bad: failure recovery
 2989          */
 2990 bad:
 2991         vm_map_unlock(srcmap);
 2992 bad2:                   /* src already unlocked */
 2993         if (chain)
 2994                 uvm_unmap_detach(chain,
 2995                     (flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0);
 2996 
 2997         if (resentry != NULL)
 2998                 uvm_mapent_free(resentry);
 2999 
 3000         if ((flags & UVM_EXTRACT_RESERVED) == 0) {
 3001                 uvm_unmap(dstmap, dstaddr, dstaddr+len);   /* ??? */
 3002         }
 3003         return (error);
 3004 }
 3005 
 3006 /* end of extraction functions */
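
The seven numbered steps above form the body of uvm_map_extract().  As a reading
aid, here is a minimal caller sketch; the argument order (srcmap, start, len,
dstmap, &dstaddr, flags) and the example_move_range() wrapper itself are
assumptions for illustration, not code from this listing.

#include <sys/param.h>
#include <uvm/uvm.h>

/*
 * Hypothetical wrapper (not part of uvm_map.c): move a page-aligned
 * range out of srcmap and into dstmap.  UVM_EXTRACT_RESERVED is not
 * set, so step 1 above reserves destination space and hands back its
 * address; UVM_EXTRACT_REMOVE makes step 6 pmap_remove() and unlink
 * the source entries.  REMOVE may not be combined with CONTIG/QREF
 * (see the KASSERT at the top of the function).
 */
static int
example_move_range(struct vm_map *srcmap, vaddr_t start, vsize_t len,
    struct vm_map *dstmap, vaddr_t *newaddrp)
{
        vaddr_t dstaddr;
        int error;

        /* start and len must already be page aligned */
        error = uvm_map_extract(srcmap, start, len, dstmap, &dstaddr,
            UVM_EXTRACT_REMOVE);
        if (error)
                return error;           /* ENOMEM, EINVAL or EIO, as above */

        *newaddrp = dstaddr;            /* destination chosen in step 1 */
        return 0;
}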
 3007 
 3008 /*
 3009  * uvm_map_submap: punch down part of a map into a submap
 3010  *
 3011  * => only the kernel_map is allowed to be submapped
 3012  * => the purpose of submapping is to break up the locking granularity
 3013  *      of a larger map
 3014  * => the range specified must have been mapped previously with a uvm_map()
 3015  *      call [with uobj==NULL] to create a blank map entry in the main map.
 3016  *      [And it had better still be blank!]
 3017  * => maps which contain submaps should never be copied or forked.
 3018  * => to remove a submap, use uvm_unmap() on the main map
 3019  *      and then uvm_map_deallocate() the submap.
 3020  * => main map must be unlocked.
 3021  * => submap must have been init'd and have a zero reference count.
 3022  *      [need not be locked as we don't actually reference it]
 3023  */
 3024 
 3025 int
 3026 uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
 3027     struct vm_map *submap)
 3028 {
 3029         struct vm_map_entry *entry;
 3030         int error;
 3031 
 3032         vm_map_lock(map);
 3033         VM_MAP_RANGE_CHECK(map, start, end);
 3034 
 3035         if (uvm_map_lookup_entry(map, start, &entry)) {
 3036                 UVM_MAP_CLIP_START(map, entry, start);
 3037                 UVM_MAP_CLIP_END(map, entry, end);      /* to be safe */
 3038         } else {
 3039                 entry = NULL;
 3040         }
 3041 
 3042         if (entry != NULL &&
 3043             entry->start == start && entry->end == end &&
 3044             entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
 3045             !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
 3046                 entry->etype |= UVM_ET_SUBMAP;
 3047                 entry->object.sub_map = submap;
 3048                 entry->offset = 0;
 3049                 uvm_map_reference(submap);
 3050                 error = 0;
 3051         } else {
 3052                 error = EINVAL;
 3053         }
 3054         vm_map_unlock(map);
 3055 
 3056         return error;
 3057 }
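
The setup sequence the header comment describes (a blank uvm_map() entry, then
uvm_map_submap()) looks roughly like the sketch below; example_make_submap() is
hypothetical, and the UVM_MAPFLAG()/UVM_UNKNOWN_OFFSET encoding used for the
uvm_map() call is the conventional one from uvm_extern.h rather than something
shown in this section.  In practice uvm_km_suballoc() wraps these two steps for
kernel submaps.

#include <sys/param.h>
#include <uvm/uvm.h>

/*
 * Hypothetical helper (not part of uvm_map.c): reserve a blank,
 * object-less range in the main map with uvm_map(), then punch the
 * submap in underneath it.  "submap" must already be set up and have
 * a zero reference count, as the header comment requires.
 */
static int
example_make_submap(struct vm_map *mainmap, struct vm_map *submap,
    vaddr_t *startp, vsize_t size)
{
        int error;

        /* step 1: blank entry (uobj == NULL) covering the range */
        error = uvm_map(mainmap, startp, size, NULL, UVM_UNKNOWN_OFFSET, 0,
            UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
            UVM_ADV_RANDOM, UVM_FLAG_NOMERGE));
        if (error)
                return error;

        /* step 2: convert the blank entry into a submap reference */
        return uvm_map_submap(mainmap, *startp, *startp + size, submap);
}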
 3058 
 3059 /*
 3060  * uvm_map_protect_user: change map protection on behalf of the user.
 3061  * Enforces PAX settings as necessary.
 3062  */
 3063 int
 3064 uvm_map_protect_user(struct lwp *l, vaddr_t start, vaddr_t end,
 3065     vm_prot_t new_prot)
 3066 {
 3067         int error;
 3068 
 3069         if ((error = PAX_MPROTECT_VALIDATE(l, new_prot)))
 3070                 return error;
 3071 
 3072         return uvm_map_protect(&l->l_proc->p_vmspace->vm_map, start, end,
 3073             new_prot, false);
 3074 }
 3075 
 3076 
 3077 /*
 3078  * uvm_map_protect: change map protection
 3079  *
 3080  * => set_max means set max_protection.
 3081  * => map must be unlocked.
 3082  */
 3083 
 3084 #define MASK(entry)     (UVM_ET_ISCOPYONWRITE(entry) ? \
 3085                          ~VM_PROT_WRITE : VM_PROT_ALL)
 3086 
 3087 int
 3088 uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
 3089     vm_prot_t new_prot, bool set_max)
 3090 {
 3091         struct vm_map_entry *current, *entry;
 3092         int error = 0;
 3093         UVMHIST_FUNC(__func__);
 3094         UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_prot=%#jx)",
 3095             (uintptr_t)map, start, end, new_prot);
 3096 
 3097         vm_map_lock(map);
 3098         VM_MAP_RANGE_CHECK(map, start, end);
 3099         if (uvm_map_lookup_entry(map, start, &entry)) {
 3100                 UVM_MAP_CLIP_START(map, entry, start);
 3101         } else {
 3102                 entry = entry->next;
 3103         }
 3104 
 3105         /*
 3106          * make a first pass to check for protection violations.
 3107          */
 3108 
 3109         current = entry;
 3110         while ((current != &map->header) && (current->start < end)) {
 3111                 if (UVM_ET_ISSUBMAP(current)) {
 3112                         error = EINVAL;
 3113                         goto out;
 3114                 }
 3115                 if ((new_prot & current->max_protection) != new_prot) {
 3116                         error = EACCES;
 3117                         goto out;
 3118                 }
 3119                 /*
 3120                  * Don't allow VM_PROT_EXECUTE to be set on entries that
 3121                  * point to vnodes that are associated with a NOEXEC file
 3122                  * system.
 3123                  */
 3124                 if (UVM_ET_ISOBJ(current) &&
 3125                     UVM_OBJ_IS_VNODE(current->object.uvm_obj)) {
 3126                         struct vnode *vp =
 3127                             (struct vnode *) current->object.uvm_obj;
 3128 
 3129                         if ((new_prot & VM_PROT_EXECUTE) != 0 &&
 3130                             (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
 3131                                 error = EACCES;
 3132                                 goto out;
 3133                         }
 3134                 }
 3135 
 3136                 current = current->next;
 3137         }
 3138 
 3139         /* go back and fix up protections, clipping entry ends to the range. */
 3140 
 3141         current = entry;
 3142         while ((current != &map->header) && (current->start < end)) {
 3143                 vm_prot_t old_prot;
 3144 
 3145                 UVM_MAP_CLIP_END(map, current, end);
 3146                 old_prot = current->protection;
 3147                 if (set_max)
 3148                         current->protection =
 3149                             (current->max_protection = new_prot) & old_prot;
 3150                 else
 3151                         current->protection = new_prot;
 3152 
 3153                 /*
 3154                  * update physical map if necessary.  worry about copy-on-write
 3155                  * here -- CHECK THIS XXX
 3156                  */
 3157 
 3158                 if (current->protection != old_prot) {
 3159                         /* update pmap! */
 3160 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
 3161                         uvm_map_lock_entry(current, RW_WRITER);
 3162 #else
 3163                         uvm_map_lock_entry(current, RW_READER);
 3164 #endif
 3165                         pmap_protect(map->pmap, current->start, current->end,
 3166                             current->protection & MASK(current));
 3167                         uvm_map_unlock_entry(current);
 3168 
 3169                         /*
 3170                          * If this entry points at a vnode, and the
 3171                          * protection includes VM_PROT_EXECUTE, mark
 3172                          * the vnode as VEXECMAP.
 3173                          */
 3174                         if (UVM_ET_ISOBJ(current)) {
 3175                                 struct uvm_object *uobj =
 3176                                     current->object.uvm_obj;
 3177 
 3178                                 if (UVM_OBJ_IS_VNODE(uobj) &&
 3179                                     (current->protection & VM_PROT_EXECUTE)) {
 3180                                         vn_markexec((struct vnode *) uobj);
 3181                                 }
 3182                         }
 3183                 }
 3184 
 3185                 /*
 3186                  * If the map is configured to lock any future mappings,
 3187                  * wire this entry now if the old protection was VM_PROT_NONE
 3188                  * and the new protection is not VM_PROT_NONE.
 3189                  */
 3190 
 3191                 if ((map->flags & VM_MAP_WIREFUTURE) != 0 &&
 3192                     VM_MAPENT_ISWIRED(current) == 0 &&
 3193                     old_prot == VM_PROT_NONE &&
 3194                     new_prot != VM_PROT_NONE) {
 3195 
 3196                         /*
 3197                          * We must call pmap_update() here because the
 3198                          * pmap_protect() call above might have removed some
 3199                          * pmap entries and uvm_map_pageable() might create
 3200                          * some new pmap entries that rely on the prior
 3201                          * removals being completely finished.
 3202                          */
 3203 
 3204                         pmap_update(map->pmap);
 3205 
 3206                         if (uvm_map_pageable(map, current->start,
 3207                             current->end, false,
 3208                             UVM_LK_ENTER|UVM_LK_EXIT) != 0) {
 3209 
 3210                                 /*
 3211                                  * If locking the entry fails, remember the
 3212                                  * error if it's the first one.  Note we
 3213                                  * still continue setting the protection in
 3214                                  * the map, but will return the error
 3215                                  * condition regardless.
 3216                                  *
 3217                                  * XXX Ignore what the actual error is,
 3218                                  * XXX just call it a resource shortage
 3219                                  * XXX so that it doesn't get confused
 3220                                  * XXX what uvm_map_protect() itself would
 3221                                  * XXX normally return.
 3222                                  */
 3223 
 3224                                 error = ENOMEM;
 3225                         }
 3226                 }
 3227                 current = current->next;
 3228         }
 3229         pmap_update(map->pmap);
 3230 
 3231  out:
 3232         vm_map_unlock(map);
 3233 
 3234         UVMHIST_LOG(maphist, "<- done, error=%jd",error,0,0,0);
 3235         return error;
 3236 }
 3237 
 3238 #undef  MASK
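
A sketch of a typical caller follows; example_make_readonly() is hypothetical
and only shows the calling convention.  With set_max true, the assignment above
makes new_prot the new maximum and clamps the current protection to
(old protection & new_prot); with set_max false, as here, only the current
protection changes, and a request exceeding max_protection is refused with
EACCES in the first pass.

#include <sys/param.h>
#include <sys/proc.h>
#include <uvm/uvm.h>

/*
 * Hypothetical mprotect-style caller (not part of uvm_map.c): revoke
 * write permission on a user range.
 */
static int
example_make_readonly(struct proc *p, vaddr_t start, vsize_t len)
{
        return uvm_map_protect(&p->p_vmspace->vm_map,
            trunc_page(start), round_page(start + len),
            VM_PROT_READ, false /* set_max */);
}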
 3239 
 3240 /*
 3241  * uvm_map_inherit: set inheritance code for range of addrs in map.
 3242  *
 3243  * => map must be unlocked
 3244  * => note that the inherit code is used during a "fork".  see fork
 3245  *      code for details.
 3246  */
 3247 
 3248 int
 3249 uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
 3250     vm_inherit_t new_inheritance)
 3251 {
 3252         struct vm_map_entry *entry, *temp_entry;
 3253         UVMHIST_FUNC(__func__);
 3254         UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_inh=%#jx)",
 3255             (uintptr_t)map, start, end, new_inheritance);
 3256 
 3257         switch (new_inheritance) {
 3258         case MAP_INHERIT_NONE:
 3259         case MAP_INHERIT_COPY:
 3260         case MAP_INHERIT_SHARE:
 3261         case MAP_INHERIT_ZERO:
 3262                 break;
 3263         default:
 3264                 UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
 3265                 return EINVAL;
 3266         }
 3267 
 3268         vm_map_lock(map);
 3269         VM_MAP_RANGE_CHECK(map, start, end);
 3270         if (uvm_map_lookup_entry(map, start, &temp_entry)) {
 3271                 entry = temp_entry;
 3272                 UVM_MAP_CLIP_START(map, entry, start);
 3273         }  else {
 3274                 entry = temp_entry->next;
 3275         }
 3276         while ((entry != &map->header) && (entry->start < end)) {
 3277                 UVM_MAP_CLIP_END(map, entry, end);
 3278                 entry->inheritance = new_inheritance;
 3279                 entry = entry->next;
 3280         }
 3281         vm_map_unlock(map);
 3282         UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
 3283         return 0;
 3284 }
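
A minherit(2)-style sketch; example_share_on_fork() is hypothetical.  The
inheritance code only takes effect at fork time, as the header comment notes.

#include <sys/param.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <uvm/uvm.h>

/*
 * Hypothetical caller (not part of uvm_map.c): mark a range so that a
 * child created by fork() shares it with the parent instead of
 * receiving a copy.
 */
static int
example_share_on_fork(struct proc *p, vaddr_t start, vsize_t len)
{
        return uvm_map_inherit(&p->p_vmspace->vm_map,
            trunc_page(start), round_page(start + len),
            MAP_INHERIT_SHARE);
}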
 3285 
 3286 /*
 3287  * uvm_map_advice: set advice code for range of addrs in map.
 3288  *
 3289  * => map must be unlocked
 3290  */
 3291 
 3292 int
 3293 uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
 3294 {
 3295         struct vm_map_entry *entry, *temp_entry;
 3296         UVMHIST_FUNC(__func__);
 3297         UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_adv=%#jx)",
 3298             (uintptr_t)map, start, end, new_advice);
 3299 
 3300         vm_map_lock(map);
 3301         VM_MAP_RANGE_CHECK(map, start, end);
 3302         if (uvm_map_lookup_entry(map, start, &temp_entry)) {
 3303                 entry = temp_entry;
 3304                 UVM_MAP_CLIP_START(map, entry, start);
 3305         } else {
 3306                 entry = temp_entry->next;
 3307         }
 3308 
 3309         /*
 3310          * XXXJRT: disallow holes?
 3311          */
 3312 
 3313         while ((entry != &map->header) && (entry->start < end)) {
 3314                 UVM_MAP_CLIP_END(map, entry, end);
 3315 
 3316                 switch (new_advice) {
 3317                 case MADV_NORMAL:
 3318                 case MADV_RANDOM:
 3319                 case MADV_SEQUENTIAL:
 3320                         /* nothing special here */
 3321                         break;
 3322 
 3323                 default:
 3324                         vm_map_unlock(map);
 3325                         UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
 3326                         return EINVAL;
 3327                 }
 3328                 entry->advice = new_advice;
 3329                 entry = entry->next;
 3330         }
 3331 
 3332         vm_map_unlock(map);
 3333         UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
 3334         return 0;
 3335 }
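
An madvise(2)-style sketch; example_advise_sequential() is hypothetical.  Only
MADV_NORMAL, MADV_RANDOM and MADV_SEQUENTIAL are accepted here; MADV_WILLNEED
goes through uvm_map_willneed() below, and the page-discarding hints are
typically routed to uvm_map_clean().

#include <sys/param.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <uvm/uvm.h>

/*
 * Hypothetical caller (not part of uvm_map.c): record a
 * sequential-access hint for a range.
 */
static int
example_advise_sequential(struct proc *p, vaddr_t start, vsize_t len)
{
        return uvm_map_advice(&p->p_vmspace->vm_map,
            trunc_page(start), round_page(start + len), MADV_SEQUENTIAL);
}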
 3336 
 3337 /*
 3338  * uvm_map_willneed: apply MADV_WILLNEED
 3339  */
 3340 
 3341 int
 3342 uvm_map_willneed(struct vm_map *map, vaddr_t start, vaddr_t end)
 3343 {
 3344         struct vm_map_entry *entry;
 3345         UVMHIST_FUNC(__func__);
 3346         UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx)",
 3347             (uintptr_t)map, start, end, 0);
 3348 
 3349         vm_map_lock_read(map);
 3350         VM_MAP_RANGE_CHECK(map, start, end);
 3351         if (!uvm_map_lookup_entry(map, start, &entry)) {
 3352                 entry = entry->next;
 3353         }
 3354         while (entry->start < end) {
 3355                 struct vm_amap * const amap = entry->aref.ar_amap;
 3356                 struct uvm_object * const uobj = entry->object.uvm_obj;
 3357 
 3358                 KASSERT(entry != &map->header);
 3359                 KASSERT(start < entry->end);
 3360                 /*
 3361                  * For now, we handle only the easy but commonly-requested case.
 3362                  * i.e., start prefetching of backing uobj pages.
 3363                  *
 3364                  * XXX It might be useful to pmap_enter() the already-in-core
 3365                  * pages by inventing a "weak" mode for uvm_fault() which would
 3366                  * only do the PGO_LOCKED pgo_get().
 3367                  */
 3368                 if (UVM_ET_ISOBJ(entry) && amap == NULL && uobj != NULL) {
 3369                         off_t offset;
 3370                         off_t size;
 3371 
 3372                         offset = entry->offset;
 3373                         if (start < entry->start) {
 3374                                 offset += entry->start - start;
 3375                         }
 3376                         size = entry->offset + (entry->end - entry->start);
 3377                         if (entry->end < end) {
 3378                                 size -= end - entry->end;
 3379                         }
 3380                         uvm_readahead(uobj, offset, size);
 3381                 }
 3382                 entry = entry->next;
 3383         }
 3384         vm_map_unlock_read(map);
 3385         UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
 3386         return 0;
 3387 }
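
The matching MADV_WILLNEED sketch; example_prefetch() is hypothetical.  As the
comment above says, only object-backed entries without an amap are prefetched,
and the map is only read-locked for the duration.

#include <sys/param.h>
#include <sys/proc.h>
#include <uvm/uvm.h>

/*
 * Hypothetical caller (not part of uvm_map.c): kick off readahead for
 * the object-backed parts of a range.
 */
static int
example_prefetch(struct proc *p, vaddr_t start, vsize_t len)
{
        return uvm_map_willneed(&p->p_vmspace->vm_map,
            trunc_page(start), round_page(start + len));
}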
 3388 
 3389 /*
 3390  * uvm_map_pageable: sets the pageability of a range in a map.
 3391  *
 3392  * => wires map entries.  should not be used for transient page locking.
 3393  *      for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()).
 3394  * => regions specified as not pageable require lock-down (wired) memory
 3395  *      and page tables.
 3396  * => map must never be read-locked
 3397  * => if islocked is true, map is already write-locked
 3398  * => we always unlock the map, since we must downgrade to a read-lock
 3399  *      to call uvm_fault_wire()
 3400  * => XXXCDC: check this and try and clean it up.
 3401  */
 3402 
 3403 int
 3404 uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
 3405     bool new_pageable, int lockflags)
 3406 {
 3407         struct vm_map_entry *entry, *start_entry, *failed_entry;
 3408         int rv;
 3409 #ifdef DIAGNOSTIC
 3410         u_int timestamp_save;
 3411 #endif
 3412         UVMHIST_FUNC(__func__);
 3413         UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_pageable=%ju)",
 3414             (uintptr_t)map, start, end, new_pageable);
 3415         KASSERT(map->flags & VM_MAP_PAGEABLE);
 3416 
 3417         if ((lockflags & UVM_LK_ENTER) == 0)
 3418                 vm_map_lock(map);
 3419         VM_MAP_RANGE_CHECK(map, start, end);
 3420 
 3421         /*
 3422          * only one pageability change may take place at one time, since
 3423          * uvm_fault_wire assumes it will be called only once for each
 3424          * wiring/unwiring.  therefore, we have to make sure we're actually
 3425          * changing the pageability for the entire region.  we do so before
 3426          * making any changes.
 3427          */
 3428 
 3429         if (uvm_map_lookup_entry(map, start, &start_entry) == false) {
 3430                 if ((lockflags & UVM_LK_EXIT) == 0)
 3431                         vm_map_unlock(map);
 3432 
 3433                 UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0);
 3434                 return EFAULT;
 3435         }
 3436         entry = start_entry;
 3437 
 3438         if (start == end) {             /* nothing required */
 3439                 if ((lockflags & UVM_LK_EXIT) == 0)
 3440                         vm_map_unlock(map);
 3441 
 3442                 UVMHIST_LOG(maphist,"<- done (nothing)",0,0,0,0);
 3443                 return 0;
 3444         }
 3445 
 3446         /*
 3447          * handle wiring and unwiring separately.
 3448          */
 3449 
 3450         if (new_pageable) {             /* unwire */
 3451                 UVM_MAP_CLIP_START(map, entry, start);
 3452 
 3453                 /*
 3454                  * unwiring.  first ensure that the range to be unwired is
 3455                  * really wired down and that there are no holes.
 3456                  */
 3457 
 3458                 while ((entry != &map->header) && (entry->start < end)) {
 3459                         if (entry->wired_count == 0 ||
 3460                             (entry->end < end &&
 3461                              (entry->next == &map->header ||
 3462                               entry->next->start > entry->end))) {
 3463                                 if ((lockflags & UVM_LK_EXIT) == 0)
 3464                                         vm_map_unlock(map);
 3465                                 UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0);
 3466                                 return EINVAL;
 3467                         }
 3468                         entry = entry->next;
 3469                 }
 3470 
 3471                 /*
 3472                  * POSIX 1003.1b - a single munlock call unlocks a region,
 3473                  * regardless of the number of mlock calls made on that
 3474                  * region.
 3475                  */
 3476 
 3477                 entry = start_entry;
 3478                 while ((entry != &map->header) && (entry->start < end)) {
 3479                         UVM_MAP_CLIP_END(map, entry, end);
 3480                         if (VM_MAPENT_ISWIRED(entry))
 3481                                 uvm_map_entry_unwire(map, entry);
 3482                         entry = entry->next;
 3483                 }
 3484                 if ((lockflags & UVM_LK_EXIT) == 0)
 3485                         vm_map_unlock(map);
 3486                 UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
 3487                 return 0;
 3488         }
 3489 
 3490         /*
 3491          * wire case: in two passes [XXXCDC: ugly block of code here]
 3492          *
 3493          * 1: holding the write lock, we create any anonymous maps that need
 3494          *    to be created.  then we clip each map entry to the region to
 3495          *    be wired and increment its wiring count.
 3496          *
 3497          * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
 3498          *    in the pages for any newly wired area (wired_count == 1).
 3499          *
 3500          *    downgrading to a read lock for uvm_fault_wire avoids a possible
 3501          *    deadlock with another thread that may have faulted on one of
 3502          *    the pages to be wired (it would mark the page busy, blocking
 3503          *    us, then in turn block on the map lock that we hold).  because
 3504          *    of problems in the recursive lock package, we cannot upgrade
 3505          *    to a write lock in vm_map_lookup.  thus, any actions that
 3506          *    require the write lock must be done beforehand.  because we
 3507          *    keep the read lock on the map, the copy-on-write status of the
 3508          *    entries we modify here cannot change.
 3509          */
 3510 
 3511         while ((entry != &map->header) && (entry->start < end)) {
 3512                 if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
 3513 
 3514                         /*
 3515                          * perform actions of vm_map_lookup that need the
 3516                          * write lock on the map: create an anonymous map
 3517                          * for a copy-on-write region, or an anonymous map
 3518                          * for a zero-fill region.  (XXXCDC: submap case
 3519                          * ok?)
 3520                          */
 3521 
 3522                         if (!UVM_ET_ISSUBMAP(entry)) {  /* not submap */
 3523                                 if (UVM_ET_ISNEEDSCOPY(entry) &&
 3524                                     ((entry->max_protection & VM_PROT_WRITE) ||
 3525                                      (entry->object.uvm_obj == NULL))) {
 3526                                         amap_copy(map, entry, 0, start, end);
 3527                                         /* XXXCDC: wait OK? */
 3528                                 }
 3529                         }
 3530                 }
 3531                 UVM_MAP_CLIP_START(map, entry, start);
 3532                 UVM_MAP_CLIP_END(map, entry, end);
 3533                 entry->wired_count++;
 3534 
 3535                 /*
 3536                  * Check for holes
 3537                  */
 3538 
 3539                 if (entry->protection == VM_PROT_NONE ||
 3540                     (entry->end < end &&
 3541                      (entry->next == &map->header ||
 3542                       entry->next->start > entry->end))) {
 3543 
 3544                         /*
 3545                          * found one.  amap creation actions do not need to
 3546                          * be undone, but the wired counts need to be restored.
 3547                          */
 3548 
 3549                         while (entry != &map->header && entry->end > start) {
 3550                                 entry->wired_count--;
 3551                                 entry = entry->prev;
 3552                         }
 3553                         if ((lockflags & UVM_LK_EXIT) == 0)
 3554                                 vm_map_unlock(map);
 3555                         UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0);
 3556                         return EINVAL;
 3557                 }
 3558                 entry = entry->next;
 3559         }
 3560 
 3561         /*
 3562          * Pass 2.
 3563          */
 3564 
 3565 #ifdef DIAGNOSTIC
 3566         timestamp_save = map->timestamp;
 3567 #endif
 3568         vm_map_busy(map);
 3569         vm_map_unlock(map);
 3570 
 3571         rv = 0;
 3572         entry = start_entry;
 3573         while (entry != &map->header && entry->start < end) {
 3574                 if (entry->wired_count == 1) {
 3575                         rv = uvm_fault_wire(map, entry->start, entry->end,
 3576                             entry->max_protection, 1);
 3577                         if (rv) {
 3578 
 3579                                 /*
 3580                                  * wiring failed.  break out of the loop.
 3581                                  * we'll clean up the map below, once we
 3582                                  * have a write lock again.
 3583                                  */
 3584 
 3585                                 break;
 3586                         }
 3587                 }
 3588                 entry = entry->next;
 3589         }
 3590 
 3591         if (rv) {       /* failed? */
 3592 
 3593                 /*
 3594                  * Get back to an exclusive (write) lock.
 3595                  */
 3596 
 3597                 vm_map_lock(map);
 3598                 vm_map_unbusy(map);
 3599 
 3600 #ifdef DIAGNOSTIC
 3601                 if (timestamp_save + 1 != map->timestamp)
 3602                         panic("uvm_map_pageable: stale map");
 3603 #endif
 3604 
 3605                 /*
 3606                  * first drop the wiring count on all the entries
 3607                  * which haven't actually been wired yet.
 3608                  */
 3609 
 3610                 failed_entry = entry;
 3611                 while (entry != &map->header && entry->start < end) {
 3612                         entry->wired_count--;
 3613                         entry = entry->next;
 3614                 }
 3615 
 3616                 /*
 3617                  * now, unwire all the entries that were successfully
 3618                  * wired above.
 3619                  */
 3620 
 3621                 entry = start_entry;
 3622                 while (entry != failed_entry) {
 3623                         entry->wired_count--;
 3624                         if (VM_MAPENT_ISWIRED(entry) == 0)
 3625                                 uvm_map_entry_unwire(map, entry);
 3626                         entry = entry->next;
 3627                 }
 3628                 if ((lockflags & UVM_LK_EXIT) == 0)
 3629                         vm_map_unlock(map);
 3630                 UVMHIST_LOG(maphist, "<- done (RV=%jd)", rv,0,0,0);
 3631                 return (rv);
 3632         }
 3633 
 3634         if ((lockflags & UVM_LK_EXIT) == 0) {
 3635                 vm_map_unbusy(map);
 3636         } else {
 3637 
 3638                 /*
 3639                  * Get back to an exclusive (write) lock.
 3640                  */
 3641 
 3642                 vm_map_lock(map);
 3643                 vm_map_unbusy(map);
 3644         }
 3645 
 3646         UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
 3647         return 0;
 3648 }
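
mlock(2)/munlock(2)-style sketches; the example_* wrappers are hypothetical.
new_pageable == false wires the range (the two passes above: bump wired_count,
then uvm_fault_wire() with the map busied); true unwires it.  With no UVM_LK_*
flags the map is locked and unlocked internally.

#include <sys/param.h>
#include <uvm/uvm.h>

/* Hypothetical wrappers (not part of uvm_map.c). */
static int
example_wire_range(struct vm_map *map, vaddr_t start, vsize_t len)
{
        return uvm_map_pageable(map, trunc_page(start),
            round_page(start + len), false /* wire */, 0);
}

static int
example_unwire_range(struct vm_map *map, vaddr_t start, vsize_t len)
{
        return uvm_map_pageable(map, trunc_page(start),
            round_page(start + len), true /* unwire */, 0);
}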
 3649 
 3650 /*
 3651  * uvm_map_pageable_all: special case of uvm_map_pageable - affects
 3652  * all mapped regions.
 3653  *
 3654  * => map must not be locked.
 3655  * => if no flags are specified, all regions are unwired.
 3656  * => XXXJRT: has some of the same problems as uvm_map_pageable() above.
 3657  */
 3658 
 3659 int
 3660 uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
 3661 {
 3662         struct vm_map_entry *entry, *failed_entry;
 3663         vsize_t size;
 3664         int rv;
 3665 #ifdef DIAGNOSTIC
 3666         u_int timestamp_save;
 3667 #endif
 3668         UVMHIST_FUNC(__func__);
 3669         UVMHIST_CALLARGS(maphist,"(map=%#jx,flags=%#jx)", (uintptr_t)map, flags,
 3670             0, 0);
 3671 
 3672         KASSERT(map->flags & VM_MAP_PAGEABLE);
 3673 
 3674         vm_map_lock(map);
 3675 
 3676         /*
 3677          * handle wiring and unwiring separately.
 3678          */
 3679 
 3680         if (flags == 0) {                       /* unwire */
 3681 
 3682                 /*
 3683                  * POSIX 1003.1b -- munlockall unlocks all regions,
 3684                  * regardless of how many times mlockall has been called.
 3685                  */
 3686 
 3687                 for (entry = map->header.next; entry != &map->header;
 3688                      entry = entry->next) {
 3689                         if (VM_MAPENT_ISWIRED(entry))
 3690                                 uvm_map_entry_unwire(map, entry);
 3691                 }
 3692                 map->flags &= ~VM_MAP_WIREFUTURE;
 3693                 vm_map_unlock(map);
 3694                 UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
 3695                 return 0;
 3696         }
 3697 
 3698         if (flags & MCL_FUTURE) {
 3699 
 3700                 /*
 3701                  * must wire all future mappings; remember this.
 3702                  */
 3703 
 3704                 map->flags |= VM_MAP_WIREFUTURE;
 3705         }
 3706 
 3707         if ((flags & MCL_CURRENT) == 0) {
 3708 
 3709                 /*
 3710                  * no more work to do!
 3711                  */
 3712 
 3713                 UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
 3714                 vm_map_unlock(map);
 3715                 return 0;
 3716         }
 3717 
 3718         /*
 3719          * wire case: in three passes [XXXCDC: ugly block of code here]
 3720          *
 3721          * 1: holding the write lock, count all pages mapped by non-wired
 3722          *    entries.  if this would cause us to go over our limit, we fail.
 3723          *
 3724          * 2: still holding the write lock, we create any anonymous maps that
 3725          *    need to be created.  then we increment each entry's wiring count.
 3726          *
 3727          * 3: we downgrade to a read lock, and call uvm_fault_wire to fault
 3728          *    in the pages for any newly wired area (wired_count == 1).
 3729          *
 3730          *    downgrading to a read lock for uvm_fault_wire avoids a possible
 3731          *    deadlock with another thread that may have faulted on one of
 3732          *    the pages to be wired (it would mark the page busy, blocking
 3733          *    us, then in turn block on the map lock that we hold).  because
 3734          *    of problems in the recursive lock package, we cannot upgrade
 3735          *    to a write lock in vm_map_lookup.  thus, any actions that
 3736          *    require the write lock must be done beforehand.  because we
 3737          *    keep the read lock on the map, the copy-on-write status of the
 3738          *    entries we modify here cannot change.
 3739          */
 3740 
 3741         for (size = 0, entry = map->header.next; entry != &map->header;
 3742              entry = entry->next) {
 3743                 if (entry->protection != VM_PROT_NONE &&
 3744                     VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
 3745                         size += entry->end - entry->start;
 3746                 }
 3747         }
 3748 
 3749         if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
 3750                 vm_map_unlock(map);
 3751                 return ENOMEM;
 3752         }
 3753 
 3754         if (limit != 0 &&
 3755             (size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
 3756                 vm_map_unlock(map);
 3757                 return ENOMEM;
 3758         }
 3759 
 3760         /*
 3761          * Pass 2.
 3762          */
 3763 
 3764         for (entry = map->header.next; entry != &map->header;
 3765              entry = entry->next) {
 3766                 if (entry->protection == VM_PROT_NONE)
 3767                         continue;
 3768                 if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
 3769 
 3770                         /*
 3771                          * perform actions of vm_map_lookup that need the
 3772                          * write lock on the map: create an anonymous map
 3773                          * for a copy-on-write region, or an anonymous map
 3774                          * for a zero-fill region.  (XXXCDC: submap case
 3775                          * ok?)
 3776                          */
 3777 
 3778                         if (!UVM_ET_ISSUBMAP(entry)) {  /* not submap */
 3779                                 if (UVM_ET_ISNEEDSCOPY(entry) &&
 3780                                     ((entry->max_protection & VM_PROT_WRITE) ||
 3781                                      (entry->object.uvm_obj == NULL))) {
 3782                                         amap_copy(map, entry, 0, entry->start,
 3783                                             entry->end);
 3784                                         /* XXXCDC: wait OK? */
 3785                                 }
 3786                         }
 3787                 }
 3788                 entry->wired_count++;
 3789         }
 3790 
 3791         /*
 3792          * Pass 3.
 3793          */
 3794 
 3795 #ifdef DIAGNOSTIC
 3796         timestamp_save = map->timestamp;
 3797 #endif
 3798         vm_map_busy(map);
 3799         vm_map_unlock(map);
 3800 
 3801         rv = 0;
 3802         for (entry = map->header.next; entry != &map->header;
 3803              entry = entry->next) {
 3804                 if (entry->wired_count == 1) {
 3805                         rv = uvm_fault_wire(map, entry->start, entry->end,
 3806                             entry->max_protection, 1);
 3807                         if (rv) {
 3808 
 3809                                 /*
 3810                                  * wiring failed.  break out of the loop.
 3811                                  * we'll clean up the map below, once we
 3812                                  * have a write lock again.
 3813                                  */
 3814 
 3815                                 break;
 3816                         }
 3817                 }
 3818         }
 3819 
 3820         if (rv) {
 3821 
 3822                 /*
 3823                  * Get back an exclusive (write) lock.
 3824                  */
 3825 
 3826                 vm_map_lock(map);
 3827                 vm_map_unbusy(map);
 3828 
 3829 #ifdef DIAGNOSTIC
 3830                 if (timestamp_save + 1 != map->timestamp)
 3831                         panic("uvm_map_pageable_all: stale map");
 3832 #endif
 3833 
 3834                 /*
 3835                  * first drop the wiring count on all the entries
 3836                  * which haven't actually been wired yet.
 3837                  *
 3838                  * Skip VM_PROT_NONE entries like we did above.
 3839                  */
 3840 
 3841                 failed_entry = entry;
 3842                 for (/* nothing */; entry != &map->header;
 3843                      entry = entry->next) {
 3844                         if (entry->protection == VM_PROT_NONE)
 3845                                 continue;
 3846                         entry->wired_count--;
 3847                 }
 3848 
 3849                 /*
 3850                  * now, unwire all the entries that were successfully
 3851                  * wired above.
 3852                  *
 3853                  * Skip VM_PROT_NONE entries like we did above.
 3854                  */
 3855 
 3856                 for (entry = map->header.next; entry != failed_entry;
 3857                      entry = entry->next) {
 3858                         if (entry->protection == VM_PROT_NONE)
 3859                                 continue;
 3860                         entry->wired_count--;
 3861                         if (VM_MAPENT_ISWIRED(entry))
 3862                                 uvm_map_entry_unwire(map, entry);
 3863                 }
 3864                 vm_map_unlock(map);
 3865                 UVMHIST_LOG(maphist,"<- done (RV=%jd)", rv,0,0,0);
 3866                 return (rv);
 3867         }
 3868 
 3869         vm_map_unbusy(map);
 3870 
 3871         UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
 3872         return 0;
 3873 }
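
An mlockall(2)-style sketch; example_lock_everything() and its lockedmax
parameter are hypothetical.  Pass 1 above compares the limit against the bytes
that would become newly wired plus what the pmap already reports as wired, so a
caller would normally pass its per-process wired-memory ceiling here.

#include <sys/param.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <uvm/uvm.h>

/*
 * Hypothetical caller (not part of uvm_map.c): wire everything now
 * and remember to wire all future mappings too.
 */
static int
example_lock_everything(struct proc *p, vsize_t lockedmax)
{
        return uvm_map_pageable_all(&p->p_vmspace->vm_map,
            MCL_CURRENT | MCL_FUTURE, lockedmax);
}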
 3874 
 3875 /*
 3876  * uvm_map_clean: clean out a map range
 3877  *
 3878  * => valid flags:
 3879  *   if (flags & PGO_CLEANIT): dirty pages are cleaned first
 3880  *   if (flags & PGO_SYNCIO): dirty pages are written synchronously
 3881  *   if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
 3882  *   if (flags & PGO_FREE): any cached pages are freed after clean
 3883  * => returns an error if any part of the specified range isn't mapped
 3884  * => never a need to flush amap layer since the anonymous memory has
 3885  *      no permanent home, but may deactivate pages there
 3886  * => called from sys_msync() and sys_madvise()
 3887  * => caller must not write-lock map (read OK).
 3888  * => we may sleep while cleaning if SYNCIO [with map read-locked]
 3889  */
 3890 
 3891 int
 3892 uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
 3893 {
 3894         struct vm_map_entry *current, *entry;
 3895         struct uvm_object *uobj;
 3896         struct vm_amap *amap;
 3897         struct vm_anon *anon;
 3898         struct vm_page *pg;
 3899         vaddr_t offset;
 3900         vsize_t size;
 3901         voff_t uoff;
 3902         int error, refs;
 3903         UVMHIST_FUNC(__func__);
 3904         UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,flags=%#jx)",
 3905             (uintptr_t)map, start, end, flags);
 3906 
 3907         KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
 3908                 (PGO_FREE|PGO_DEACTIVATE));
 3909 
 3910         vm_map_lock_read(map);
 3911         VM_MAP_RANGE_CHECK(map, start, end);
 3912         if (uvm_map_lookup_entry(map, start, &entry) == false) {
 3913                 vm_map_unlock_read(map);
 3914                 return EFAULT;
 3915         }
 3916 
 3917         /*
 3918          * Make a first pass to check for holes and wiring problems.
 3919          */
 3920 
 3921         for (current = entry; current->start < end; current = current->next) {
 3922                 if (UVM_ET_ISSUBMAP(current)) {
 3923                         vm_map_unlock_read(map);
 3924                         return EINVAL;
 3925                 }
 3926                 if ((flags & PGO_FREE) != 0 && VM_MAPENT_ISWIRED(entry)) {
 3927                         vm_map_unlock_read(map);
 3928                         return EBUSY;
 3929                 }
 3930                 if (end <= current->end) {
 3931                         break;
 3932                 }
 3933                 if (current->end != current->next->start) {
 3934                         vm_map_unlock_read(map);
 3935                         return EFAULT;
 3936                 }
 3937         }
 3938 
 3939         error = 0;
 3940         for (current = entry; start < end; current = current->next) {
 3941                 amap = current->aref.ar_amap;   /* upper layer */
 3942                 uobj = current->object.uvm_obj; /* lower layer */
 3943                 KASSERT(start >= current->start);
 3944 
 3945                 /*
 3946                  * No amap cleaning necessary if:
 3947                  *
 3948                  *      (1) There's no amap.
 3949                  *
 3950                  *      (2) We're not deactivating or freeing pages.
 3951                  */
 3952 
 3953                 if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
 3954                         goto flush_object;
 3955 
 3956                 offset = start - current->start;
 3957                 size = MIN(end, current->end) - start;
 3958 
 3959                 amap_lock(amap, RW_WRITER);
 3960                 for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) {
 3961                         anon = amap_lookup(&current->aref, offset);
 3962                         if (anon == NULL)
 3963                                 continue;
 3964 
 3965                         KASSERT(anon->an_lock == amap->am_lock);
 3966                         pg = anon->an_page;
 3967                         if (pg == NULL) {
 3968                                 continue;
 3969                         }
 3970                         if (pg->flags & PG_BUSY) {
 3971                                 continue;
 3972                         }
 3973 
 3974                         switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
 3975 
 3976                         /*
 3977                          * In these first 3 cases, we just deactivate the page.
 3978                          */
 3979 
 3980                         case PGO_CLEANIT|PGO_FREE:
 3981                         case PGO_CLEANIT|PGO_DEACTIVATE:
 3982                         case PGO_DEACTIVATE:
 3983  deactivate_it:
 3984                                 /*
 3985                                  * skip the page if it's loaned or wired,
 3986                                  * since it shouldn't be on a paging queue
 3987                                  * at all in these cases.
 3988                                  */
 3989 
 3990                                 if (pg->loan_count != 0 ||
 3991                                     pg->wire_count != 0) {
 3992                                         continue;
 3993                                 }
 3994                                 KASSERT(pg->uanon == anon);
 3995                                 uvm_pagelock(pg);
 3996                                 uvm_pagedeactivate(pg);
 3997                                 uvm_pageunlock(pg);
 3998                                 continue;
 3999 
 4000                         case PGO_FREE:
 4001 
 4002                                 /*
 4003                                  * If there are multiple references to
 4004                                  * the amap, just deactivate the page.
 4005                                  */
 4006 
 4007                                 if (amap_refs(amap) > 1)
 4008                                         goto deactivate_it;
 4009 
 4010                                 /* skip the page if it's wired */
 4011                                 if (pg->wire_count != 0) {
 4012                                         continue;
 4013                                 }
 4014                                 amap_unadd(&current->aref, offset);
 4015                                 refs = --anon->an_ref;
 4016                                 if (refs == 0) {
 4017                                         uvm_anfree(anon);
 4018                                 }
 4019                                 continue;
 4020                         }
 4021                 }
 4022                 amap_unlock(amap);
 4023 
 4024  flush_object:
 4025                 /*
 4026                  * flush pages if we've got a valid backing object.
 4027                  * note that we must always clean object pages before
 4028                  * freeing them since otherwise we could reveal stale
 4029                  * data from files.
 4030                  */
 4031 
 4032                 uoff = current->offset + (start - current->start);
 4033                 size = MIN(end, current->end) - start;
 4034                 if (uobj != NULL) {
 4035                         rw_enter(uobj->vmobjlock, RW_WRITER);
 4036                         if (uobj->pgops->pgo_put != NULL)
 4037                                 error = (uobj->pgops->pgo_put)(uobj, uoff,
 4038                                     uoff + size, flags | PGO_CLEANIT);
 4039                         else
 4040                                 error = 0;
 4041                 }
 4042                 start += size;
 4043         }
 4044         vm_map_unlock_read(map);
 4045         return (error);
 4046 }
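
A synchronous msync(2)-style sketch; example_sync_range() is hypothetical, and
the exact flag combinations used by the real sys_msync() path are not shown in
this section.

#include <sys/param.h>
#include <sys/proc.h>
#include <uvm/uvm.h>

/*
 * Hypothetical caller (not part of uvm_map.c): write back dirty pages
 * in a range and wait for the I/O.  PGO_FREE or PGO_DEACTIVATE could
 * be added to also discard or deactivate the cleaned pages, but not
 * both at once (see the KASSERT above).
 */
static int
example_sync_range(struct proc *p, vaddr_t start, vsize_t len)
{
        return uvm_map_clean(&p->p_vmspace->vm_map,
            trunc_page(start), round_page(start + len),
            PGO_CLEANIT | PGO_SYNCIO);
}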
 4047 
 4048 
 4049 /*
 4050  * uvm_map_checkprot: check protection in map
 4051  *
 4052  * => must allow specified protection in a fully allocated region.
 4053  * => map must be read or write locked by caller.
 4054  */
 4055 
 4056 bool
 4057 uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
 4058     vm_prot_t protection)
 4059 {
 4060         struct vm_map_entry *entry;
 4061         struct vm_map_entry *tmp_entry;
 4062 
 4063         if (!uvm_map_lookup_entry(map, start, &tmp_entry)) {
 4064                 return (false);
 4065         }
 4066         entry = tmp_entry;
 4067         while (start < end) {
 4068                 if (entry == &map->header) {
 4069                         return (false);
 4070                 }
 4071 
 4072                 /*
 4073                  * no holes allowed
 4074                  */
 4075 
 4076                 if (start < entry->start) {
 4077                         return (false);
 4078                 }
 4079 
 4080                 /*
 4081                  * check protection associated with entry
 4082                  */
 4083 
 4084                 if ((entry->protection & protection) != protection) {
 4085                         return (false);
 4086                 }
 4087                 start = entry->end;
 4088                 entry = entry->next;
 4089         }
 4090         return (true);
 4091 }
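
A sketch of a caller; example_range_is_writable() is hypothetical.  The map
lock must be held across the check, as the header comment requires; a read lock
is enough.

#include <sys/param.h>
#include <uvm/uvm.h>

/*
 * Hypothetical helper (not part of uvm_map.c): check that an entire
 * range is mapped read/write, with no holes.
 */
static bool
example_range_is_writable(struct vm_map *map, vaddr_t start, vsize_t len)
{
        bool ok;

        vm_map_lock_read(map);
        ok = uvm_map_checkprot(map, trunc_page(start),
            round_page(start + len), VM_PROT_READ | VM_PROT_WRITE);
        vm_map_unlock_read(map);

        return ok;
}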
 4092 
 4093 /*
 4094  * uvmspace_alloc: allocate a vmspace structure.
 4095  *
 4096  * - structure includes vm_map and pmap
 4097  * - XXX: no locking on this structure
 4098  * - refcnt set to 1, rest must be init'd by caller
 4099  */
 4100 struct vmspace *
 4101 uvmspace_alloc(vaddr_t vmin, vaddr_t vmax, bool topdown)
 4102 {
 4103         struct vmspace *vm;
 4104         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 4105 
 4106         vm = pool_cache_get(&uvm_vmspace_cache, PR_WAITOK);
 4107         uvmspace_init(vm, NULL, vmin, vmax, topdown);
 4108         UVMHIST_LOG(maphist,"<- done (vm=%#jx)", (uintptr_t)vm, 0, 0, 0);
 4109         return (vm);
 4110 }
 4111 
 4112 /*
 4113  * uvmspace_init: initialize a vmspace structure.
 4114  *
 4115  * - XXX: no locking on this structure
 4116  * - refcnt set to 1, rest must be init'd by caller
 4117  */
 4118 void
 4119 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin,
 4120     vaddr_t vmax, bool topdown)
 4121 {
 4122         UVMHIST_FUNC(__func__);
 4123         UVMHIST_CALLARGS(maphist, "(vm=%#jx, pmap=%#jx, vmin=%#jx, vmax=%#jx",
 4124             (uintptr_t)vm, (uintptr_t)pmap, vmin, vmax);
 4125         UVMHIST_LOG(maphist, "   topdown=%ju)", topdown, 0, 0, 0);
 4126 
 4127         memset(vm, 0, sizeof(*vm));
 4128         uvm_map_setup(&vm->vm_map, vmin, vmax, VM_MAP_PAGEABLE
 4129             | (topdown ? VM_MAP_TOPDOWN : 0)
 4130             );
 4131         if (pmap)
 4132                 pmap_reference(pmap);
 4133         else
 4134                 pmap = pmap_create();
 4135         vm->vm_map.pmap = pmap;
 4136         vm->vm_refcnt = 1;
 4137         UVMHIST_LOG(maphist,"<- done",0,0,0,0);
 4138 }
 4139 
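/*
 * Illustrative sketch (assumption, not original code): allocating and
 * releasing a stand-alone vmspace.  VM_MIN_ADDRESS/VM_MAX_ADDRESS are
 * assumed to be the usual machine-dependent user VA bounds; the single
 * reference returned by uvmspace_alloc() is dropped with uvmspace_free().
 */
#if 0
	struct vmspace *vm;

	vm = uvmspace_alloc(VM_MIN_ADDRESS, VM_MAX_ADDRESS, true);
	/* ... populate vm->vm_map ... */
	uvmspace_free(vm);
#endif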
 4140 /*
 4141  * uvmspace_share: share a vmspace between two processes
 4142  *
 4143  * - used for vfork, threads(?)
 4144  */
 4145 
 4146 void
 4147 uvmspace_share(struct proc *p1, struct proc *p2)
 4148 {
 4149 
 4150         uvmspace_addref(p1->p_vmspace);
 4151         p2->p_vmspace = p1->p_vmspace;
 4152 }
 4153 
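/*
 * Illustrative sketch (assumption): the vfork(2)-style sharing the
 * comment above refers to -- the child simply borrows the parent's
 * vmspace until it execs or exits.  "parent" and "child" are
 * hypothetical struct proc pointers.
 */
#if 0
	uvmspace_share(parent, child);
	KASSERT(child->p_vmspace == parent->p_vmspace);
#endif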
 4154 #if 0
 4155 
 4156 /*
 4157  * uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace
 4158  *
 4159  * - XXX: no locking on vmspace
 4160  */
 4161 
 4162 void
 4163 uvmspace_unshare(struct lwp *l)
 4164 {
 4165         struct proc *p = l->l_proc;
 4166         struct vmspace *nvm, *ovm = p->p_vmspace;
 4167 
 4168         if (ovm->vm_refcnt == 1)
 4169                 /* nothing to do: vmspace isn't shared in the first place */
 4170                 return;
 4171 
 4172         /* make a new vmspace, still holding old one */
 4173         nvm = uvmspace_fork(ovm);
 4174 
 4175         kpreempt_disable();
 4176         pmap_deactivate(l);             /* unbind old vmspace */
 4177         p->p_vmspace = nvm;
 4178         pmap_activate(l);               /* switch to new vmspace */
 4179         kpreempt_enable();
 4180 
 4181         uvmspace_free(ovm);             /* drop reference to old vmspace */
 4182 }
 4183 
 4184 #endif
 4185 
 4186 
 4187 /*
 4188  * uvmspace_spawn: a new process has been spawned and needs a vmspace
 4189  */
 4190 
 4191 void
 4192 uvmspace_spawn(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
 4193 {
 4194         struct proc *p = l->l_proc;
 4195         struct vmspace *nvm;
 4196 
 4197 #ifdef __HAVE_CPU_VMSPACE_EXEC
 4198         cpu_vmspace_exec(l, start, end);
 4199 #endif
 4200 
 4201         nvm = uvmspace_alloc(start, end, topdown);
 4202         kpreempt_disable();
 4203         p->p_vmspace = nvm;
 4204         pmap_activate(l);
 4205         kpreempt_enable();
 4206 }
 4207 
 4208 /*
 4209  * uvmspace_exec: the process wants to exec a new program
 4210  */
 4211 
 4212 void
 4213 uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
 4214 {
 4215         struct proc *p = l->l_proc;
 4216         struct vmspace *nvm, *ovm = p->p_vmspace;
 4217         struct vm_map *map;
 4218         int flags;
 4219 
 4220         KASSERT(ovm != NULL);
 4221 #ifdef __HAVE_CPU_VMSPACE_EXEC
 4222         cpu_vmspace_exec(l, start, end);
 4223 #endif
 4224 
 4225         map = &ovm->vm_map;
 4226         /*
 4227          * see if more than one process is using this vmspace...
 4228          */
 4229 
 4230         if (ovm->vm_refcnt == 1
 4231             && topdown == ((ovm->vm_map.flags & VM_MAP_TOPDOWN) != 0)) {
 4232 
 4233                 /*
 4234                  * if p is the only process using its vmspace then we can safely
 4235                  * recycle that vmspace for the program that is being exec'd.
 4236                  * But only if TOPDOWN matches the requested value for the new
 4237                  * vm space!
 4238                  */
 4239 
 4240                 /*
 4241                  * SYSV SHM semantics require us to kill all segments on an exec
 4242                  */
 4243                 if (uvm_shmexit && ovm->vm_shm)
 4244                         (*uvm_shmexit)(ovm);
 4245 
 4246                 /*
 4247                  * POSIX 1003.1b -- "lock future mappings" is revoked
 4248                  * when a process execs another program image.
 4249                  */
 4250 
 4251                 map->flags &= ~VM_MAP_WIREFUTURE;
 4252 
 4253                 /*
 4254                  * now unmap the old program.
 4255                  *
 4256                  * XXX set VM_MAP_DYING for the duration, so pmap_update()
 4257                  * is not called until the pmap has been totally cleared out
 4258                  * after pmap_remove_all(), or it can confuse some pmap
 4259                  * implementations.  it would be nice to handle this by
 4260                  * deferring the pmap_update() while it is known the address
 4261                  * space is not visible to any user LWP other than curlwp,
 4262                  * but there isn't an elegant way of inferring that right
 4263                  * now.
 4264                  */
 4265 
 4266                 flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
 4267                 map->flags |= VM_MAP_DYING;
 4268                 uvm_unmap1(map, vm_map_min(map), vm_map_max(map), flags);
 4269                 map->flags &= ~VM_MAP_DYING;
 4270                 pmap_update(map->pmap);
 4271                 KASSERT(map->header.prev == &map->header);
 4272                 KASSERT(map->nentries == 0);
 4273 
 4274                 /*
 4275                  * resize the map
 4276                  */
 4277 
 4278                 vm_map_setmin(map, start);
 4279                 vm_map_setmax(map, end);
 4280         } else {
 4281 
 4282                 /*
 4283                  * p's vmspace is being shared, so we can't reuse it for p since
 4284                  * it is still being used for others.   allocate a new vmspace
 4285                  * for p
 4286                  */
 4287 
 4288                 nvm = uvmspace_alloc(start, end, topdown);
 4289 
 4290                 /*
 4291                  * install new vmspace and drop our ref to the old one.
 4292                  */
 4293 
 4294                 kpreempt_disable();
 4295                 pmap_deactivate(l);
 4296                 p->p_vmspace = nvm;
 4297                 pmap_activate(l);
 4298                 kpreempt_enable();
 4299 
 4300                 uvmspace_free(ovm);
 4301         }
 4302 }
 4303 
 4304 /*
 4305  * uvmspace_addref: add a reference to a vmspace.
 4306  */
 4307 
 4308 void
 4309 uvmspace_addref(struct vmspace *vm)
 4310 {
 4311 
 4312         KASSERT((vm->vm_map.flags & VM_MAP_DYING) == 0);
 4313         KASSERT(vm->vm_refcnt > 0);
 4314         atomic_inc_uint(&vm->vm_refcnt);
 4315 }
 4316 
 4317 /*
 4318  * uvmspace_free: free a vmspace data structure
 4319  */
 4320 
 4321 void
 4322 uvmspace_free(struct vmspace *vm)
 4323 {
 4324         struct vm_map_entry *dead_entries;
 4325         struct vm_map *map = &vm->vm_map;
 4326         int flags;
 4327 
 4328         UVMHIST_FUNC(__func__);
 4329         UVMHIST_CALLARGS(maphist,"(vm=%#jx) ref=%jd", (uintptr_t)vm,
 4330             vm->vm_refcnt, 0, 0);
 4331 
 4332         membar_release();
 4333         if (atomic_dec_uint_nv(&vm->vm_refcnt) > 0)
 4334                 return;
 4335         membar_acquire();
 4336 
 4337         /*
 4338          * at this point, there should be no other references to the map.
 4339          * delete all of the mappings, then destroy the pmap.
 4340          */
 4341 
 4342         map->flags |= VM_MAP_DYING;
 4343         flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
 4344 
 4345         /* Get rid of any SYSV shared memory segments. */
 4346         if (uvm_shmexit && vm->vm_shm != NULL)
 4347                 (*uvm_shmexit)(vm);
 4348 
 4349         if (map->nentries) {
 4350                 uvm_unmap_remove(map, vm_map_min(map), vm_map_max(map),
 4351                     &dead_entries, flags);
 4352                 if (dead_entries != NULL)
 4353                         uvm_unmap_detach(dead_entries, 0);
 4354         }
 4355         KASSERT(map->nentries == 0);
 4356         KASSERT(map->size == 0);
 4357 
 4358         mutex_destroy(&map->misc_lock);
 4359         rw_destroy(&map->lock);
 4360         cv_destroy(&map->cv);
 4361         pmap_destroy(map->pmap);
 4362         pool_cache_put(&uvm_vmspace_cache, vm);
 4363 }
 4364 
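/*
 * Illustrative sketch (assumption): the reference-count discipline
 * around a vmspace.  Every uvmspace_addref() is balanced by a
 * uvmspace_free(); the final uvmspace_free() tears down the map and
 * pmap as shown above.  "p" is a hypothetical struct proc pointer.
 */
#if 0
	struct vmspace *vm = p->p_vmspace;

	uvmspace_addref(vm);		/* keep vm alive while we sleep */
	/* ... potentially blocking work that uses vm->vm_map ... */
	uvmspace_free(vm);		/* drop our reference */
#endif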
 4365 static struct vm_map_entry *
 4366 uvm_mapent_clone(struct vm_map *new_map, struct vm_map_entry *old_entry,
 4367     int flags)
 4368 {
 4369         struct vm_map_entry *new_entry;
 4370 
 4371         new_entry = uvm_mapent_alloc(new_map, 0);
 4372         /* old_entry -> new_entry */
 4373         uvm_mapent_copy(old_entry, new_entry);
 4374 
 4375         /* new pmap has nothing wired in it */
 4376         new_entry->wired_count = 0;
 4377 
 4378         /*
 4379          * gain reference to object backing the map (can't
 4380          * be a submap, already checked this case).
 4381          */
 4382 
 4383         if (new_entry->aref.ar_amap)
 4384                 uvm_map_reference_amap(new_entry, flags);
 4385 
 4386         if (new_entry->object.uvm_obj &&
 4387             new_entry->object.uvm_obj->pgops->pgo_reference)
 4388                 new_entry->object.uvm_obj->pgops->pgo_reference(
 4389                         new_entry->object.uvm_obj);
 4390 
 4391         /* insert entry at end of new_map's entry list */
 4392         uvm_map_entry_link(new_map, new_map->header.prev,
 4393             new_entry);
 4394 
 4395         return new_entry;
 4396 }
 4397 
 4398 /*
 4399  * share the mapping: this means we want the old and
 4400  * new entries to share amaps and backing objects.
 4401  */
 4402 static void
 4403 uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map,
 4404     struct vm_map_entry *old_entry)
 4405 {
 4406         /*
 4407          * if the old_entry needs a new amap (due to prev fork)
 4408          * then we need to allocate it now so that we have
 4409          * something we own to share with the new_entry.   [in
 4410          * other words, we need to clear needs_copy]
 4411          */
 4412 
 4413         if (UVM_ET_ISNEEDSCOPY(old_entry)) {
 4414                 /* get our own amap, clears needs_copy */
 4415                 amap_copy(old_map, old_entry, AMAP_COPY_NOCHUNK,
 4416                     0, 0);
 4417                 /* XXXCDC: WAITOK??? */
 4418         }
 4419 
 4420         uvm_mapent_clone(new_map, old_entry, AMAP_SHARED);
 4421 }
 4422 
 4423 
 4424 static void
 4425 uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map,
 4426     struct vm_map_entry *old_entry)
 4427 {
 4428         struct vm_map_entry *new_entry;
 4429 
 4430         /*
 4431          * copy-on-write the mapping (using mmap's
 4432          * MAP_PRIVATE semantics)
 4433          *
 4434          * allocate new_entry, adjust reference counts.
 4435          * (note that new references are read-only).
 4436          */
 4437 
 4438         new_entry = uvm_mapent_clone(new_map, old_entry, 0);
 4439 
 4440         new_entry->etype |=
 4441             (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
 4442 
 4443         /*
 4444          * the new entry will need an amap.  it will either
 4445          * need to be copied from the old entry or created
 4446          * from scratch (if the old entry does not have an
 4447          * amap).  can we defer this process until later
 4448          * (by setting "needs_copy") or do we need to copy
 4449          * the amap now?
 4450          *
 4451          * we must copy the amap now if any of the following
 4452          * conditions hold:
 4453          * 1. the old entry has an amap and that amap is
 4454          *    being shared.  this means that the old (parent)
 4455          *    process is sharing the amap with another
 4456          *    process.  if we do not clear needs_copy here
 4457          *    we will end up in a situation where both the
 4458          *    parent and child process are referring to the
 4459          *    same amap with "needs_copy" set.  if the
 4460          *    parent write-faults, the fault routine will
 4461          *    clear "needs_copy" in the parent by allocating
 4462          *    a new amap.   this is wrong because the
 4463          *    parent is supposed to be sharing the old amap
 4464          *    and the new amap will break that.
 4465          *
 4466          * 2. if the old entry has an amap and a non-zero
 4467          *    wire count then we are going to have to call
 4468          *    amap_cow_now to avoid page faults in the
 4469          *    parent process.   since amap_cow_now requires
 4470          *    "needs_copy" to be clear we might as well
 4471          *    clear it here as well.
 4472          *
 4473          */
 4474 
 4475         if (old_entry->aref.ar_amap != NULL) {
 4476                 if ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 ||
 4477                     VM_MAPENT_ISWIRED(old_entry)) {
 4478 
 4479                         amap_copy(new_map, new_entry,
 4480                             AMAP_COPY_NOCHUNK, 0, 0);
 4481                         /* XXXCDC: M_WAITOK ... ok? */
 4482                 }
 4483         }
 4484 
 4485         /*
 4486          * if the parent's entry is wired down, then the
 4487          * parent process does not want page faults on
 4488          * access to that memory.  this means that we
 4489          * cannot do copy-on-write because we can't write
 4490          * protect the old entry.   in this case we
 4491          * resolve all copy-on-write faults now, using
 4492          * amap_cow_now.   note that we have already
 4493          * allocated any needed amap (above).
 4494          */
 4495 
 4496         if (VM_MAPENT_ISWIRED(old_entry)) {
 4497 
 4498                 /*
 4499                  * resolve all copy-on-write faults now
 4500                  * (note that there is nothing to do if
 4501                  * the old mapping does not have an amap).
 4502                  */
 4503                 if (old_entry->aref.ar_amap)
 4504                         amap_cow_now(new_map, new_entry);
 4505 
 4506         } else {
 4507                 /*
 4508                  * setup mappings to trigger copy-on-write faults
 4509                  * we must write-protect the parent if it has
 4510                  * an amap and it is not already "needs_copy"...
 4511                  * if it is already "needs_copy" then the parent
 4512                  * has already been write-protected by a previous
 4513                  * fork operation.
 4514                  */
 4515                 if (old_entry->aref.ar_amap &&
 4516                     !UVM_ET_ISNEEDSCOPY(old_entry)) {
 4517                         if (old_entry->max_protection & VM_PROT_WRITE) {
 4518 #ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
 4519                                 uvm_map_lock_entry(old_entry, RW_WRITER);
 4520 #else
 4521                                 uvm_map_lock_entry(old_entry, RW_READER);
 4522 #endif
 4523                                 pmap_protect(old_map->pmap,
 4524                                     old_entry->start, old_entry->end,
 4525                                     old_entry->protection & ~VM_PROT_WRITE);
 4526                                 uvm_map_unlock_entry(old_entry);
 4527                         }
 4528                         old_entry->etype |= UVM_ET_NEEDSCOPY;
 4529                 }
 4530         }
 4531 }
 4532 
 4533 /*
 4534  * zero the mapping: the new entry will be zero initialized
 4535  */
 4536 static void
 4537 uvm_mapent_forkzero(struct vm_map *new_map, struct vm_map *old_map,
 4538     struct vm_map_entry *old_entry)
 4539 {
 4540         struct vm_map_entry *new_entry;
 4541 
 4542         new_entry = uvm_mapent_clone(new_map, old_entry, 0);
 4543 
 4544         new_entry->etype |=
 4545             (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
 4546 
 4547         if (new_entry->aref.ar_amap) {
 4548                 uvm_map_unreference_amap(new_entry, 0);
 4549                 new_entry->aref.ar_pageoff = 0;
 4550                 new_entry->aref.ar_amap = NULL;
 4551         }
 4552 
 4553         if (UVM_ET_ISOBJ(new_entry)) {
 4554                 if (new_entry->object.uvm_obj->pgops->pgo_detach)
 4555                         new_entry->object.uvm_obj->pgops->pgo_detach(
 4556                             new_entry->object.uvm_obj);
 4557                 new_entry->object.uvm_obj = NULL;
 4558                 new_entry->offset = 0;
 4559                 new_entry->etype &= ~UVM_ET_OBJ;
 4560         }
 4561 }
 4562 
 4563 /*
 4564  *   F O R K   -   m a i n   e n t r y   p o i n t
 4565  */
 4566 /*
 4567  * uvmspace_fork: fork a process' main map
 4568  *
 4569  * => create a new vmspace for child process from parent.
 4570  * => parent's map must not be locked.
 4571  */
 4572 
 4573 struct vmspace *
 4574 uvmspace_fork(struct vmspace *vm1)
 4575 {
 4576         struct vmspace *vm2;
 4577         struct vm_map *old_map = &vm1->vm_map;
 4578         struct vm_map *new_map;
 4579         struct vm_map_entry *old_entry;
 4580         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 4581 
 4582         vm_map_lock(old_map);
 4583 
 4584         vm2 = uvmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
 4585             vm1->vm_map.flags & VM_MAP_TOPDOWN);
 4586         memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
 4587             (char *) (vm1 + 1) - (char *) &vm1->vm_startcopy);
 4588         new_map = &vm2->vm_map;           /* XXX */
 4589 
 4590         old_entry = old_map->header.next;
 4591         new_map->size = old_map->size;
 4592 
 4593         /*
 4594          * go entry-by-entry
 4595          */
 4596 
 4597         while (old_entry != &old_map->header) {
 4598 
 4599                 /*
 4600                  * first, some sanity checks on the old entry
 4601                  */
 4602 
 4603                 KASSERT(!UVM_ET_ISSUBMAP(old_entry));
 4604                 KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) ||
 4605                         !UVM_ET_ISNEEDSCOPY(old_entry));
 4606 
 4607                 switch (old_entry->inheritance) {
 4608                 case MAP_INHERIT_NONE:
 4609                         /*
 4610                          * drop the mapping, modify size
 4611                          */
 4612                         new_map->size -= old_entry->end - old_entry->start;
 4613                         break;
 4614 
 4615                 case MAP_INHERIT_SHARE:
 4616                         uvm_mapent_forkshared(new_map, old_map, old_entry);
 4617                         break;
 4618 
 4619                 case MAP_INHERIT_COPY:
 4620                         uvm_mapent_forkcopy(new_map, old_map, old_entry);
 4621                         break;
 4622 
 4623                 case MAP_INHERIT_ZERO:
 4624                         uvm_mapent_forkzero(new_map, old_map, old_entry);
 4625                         break;
 4626                 default:
 4627                         KASSERT(0);
 4628                         break;
 4629                 }
 4630                 old_entry = old_entry->next;
 4631         }
 4632 
 4633         pmap_update(old_map->pmap);
 4634         vm_map_unlock(old_map);
 4635 
 4636         if (uvm_shmfork && vm1->vm_shm)
 4637                 (*uvm_shmfork)(vm1, vm2);
 4638 
 4639 #ifdef PMAP_FORK
 4640         pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap);
 4641 #endif
 4642 
 4643         UVMHIST_LOG(maphist,"<- done",0,0,0,0);
 4644         return (vm2);
 4645 }
 4646 
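/*
 * Illustrative sketch (not original code): the fork(2) path duplicates
 * the parent's vmspace with uvmspace_fork() and hands the result to the
 * child; the surrounding proc/lwp bookkeeping is elided.  "p1" and "p2"
 * are hypothetical parent and child proc pointers.
 */
#if 0
	p2->p_vmspace = uvmspace_fork(p1->p_vmspace);
#endif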
 4647 
 4648 /*
 4649  * uvm_mapent_trymerge: try to merge an entry with its neighbors.
 4650  *
 4651  * => called with map locked.
 4652  * => return non zero if successfully merged.
 4653  */
 4654 
 4655 int
 4656 uvm_mapent_trymerge(struct vm_map *map, struct vm_map_entry *entry, int flags)
 4657 {
 4658         struct uvm_object *uobj;
 4659         struct vm_map_entry *next;
 4660         struct vm_map_entry *prev;
 4661         vsize_t size;
 4662         int merged = 0;
 4663         bool copying;
 4664         int newetype;
 4665 
 4666         if (entry->aref.ar_amap != NULL) {
 4667                 return 0;
 4668         }
 4669         if ((entry->flags & UVM_MAP_NOMERGE) != 0) {
 4670                 return 0;
 4671         }
 4672 
 4673         uobj = entry->object.uvm_obj;
 4674         size = entry->end - entry->start;
 4675         copying = (flags & UVM_MERGE_COPYING) != 0;
 4676         newetype = copying ? (entry->etype & ~UVM_ET_NEEDSCOPY) : entry->etype;
 4677 
 4678         next = entry->next;
 4679         if (next != &map->header &&
 4680             next->start == entry->end &&
 4681             ((copying && next->aref.ar_amap != NULL &&
 4682             amap_refs(next->aref.ar_amap) == 1) ||
 4683             (!copying && next->aref.ar_amap == NULL)) &&
 4684             UVM_ET_ISCOMPATIBLE(next, newetype,
 4685             uobj, entry->flags, entry->protection,
 4686             entry->max_protection, entry->inheritance, entry->advice,
 4687             entry->wired_count) &&
 4688             (uobj == NULL || entry->offset + size == next->offset)) {
 4689                 int error;
 4690 
 4691                 if (copying) {
 4692                         error = amap_extend(next, size,
 4693                             AMAP_EXTEND_NOWAIT|AMAP_EXTEND_BACKWARDS);
 4694                 } else {
 4695                         error = 0;
 4696                 }
 4697                 if (error == 0) {
 4698                         if (uobj) {
 4699                                 if (uobj->pgops->pgo_detach) {
 4700                                         uobj->pgops->pgo_detach(uobj);
 4701                                 }
 4702                         }
 4703 
 4704                         entry->end = next->end;
 4705                         clear_hints(map, next);
 4706                         uvm_map_entry_unlink(map, next);
 4707                         if (copying) {
 4708                                 entry->aref = next->aref;
 4709                                 entry->etype &= ~UVM_ET_NEEDSCOPY;
 4710                         }
 4711                         uvm_map_check(map, "trymerge forwardmerge");
 4712                         uvm_mapent_free(next);
 4713                         merged++;
 4714                 }
 4715         }
 4716 
 4717         prev = entry->prev;
 4718         if (prev != &map->header &&
 4719             prev->end == entry->start &&
 4720             ((copying && !merged && prev->aref.ar_amap != NULL &&
 4721             amap_refs(prev->aref.ar_amap) == 1) ||
 4722             (!copying && prev->aref.ar_amap == NULL)) &&
 4723             UVM_ET_ISCOMPATIBLE(prev, newetype,
 4724             uobj, entry->flags, entry->protection,
 4725             entry->max_protection, entry->inheritance, entry->advice,
 4726             entry->wired_count) &&
 4727             (uobj == NULL ||
 4728             prev->offset + prev->end - prev->start == entry->offset)) {
 4729                 int error;
 4730 
 4731                 if (copying) {
 4732                         error = amap_extend(prev, size,
 4733                             AMAP_EXTEND_NOWAIT|AMAP_EXTEND_FORWARDS);
 4734                 } else {
 4735                         error = 0;
 4736                 }
 4737                 if (error == 0) {
 4738                         if (uobj) {
 4739                                 if (uobj->pgops->pgo_detach) {
 4740                                         uobj->pgops->pgo_detach(uobj);
 4741                                 }
 4742                                 entry->offset = prev->offset;
 4743                         }
 4744 
 4745                         entry->start = prev->start;
 4746                         clear_hints(map, prev);
 4747                         uvm_map_entry_unlink(map, prev);
 4748                         if (copying) {
 4749                                 entry->aref = prev->aref;
 4750                                 entry->etype &= ~UVM_ET_NEEDSCOPY;
 4751                         }
 4752                         uvm_map_check(map, "trymerge backmerge");
 4753                         uvm_mapent_free(prev);
 4754                         merged++;
 4755                 }
 4756         }
 4757 
 4758         return merged;
 4759 }
 4760 
 4761 /*
 4762  * uvm_map_setup: init map
 4763  *
 4764  * => map must not be in service yet.
 4765  */
 4766 
 4767 void
 4768 uvm_map_setup(struct vm_map *map, vaddr_t vmin, vaddr_t vmax, int flags)
 4769 {
 4770 
 4771         rb_tree_init(&map->rb_tree, &uvm_map_tree_ops);
 4772         map->header.next = map->header.prev = &map->header;
 4773         map->nentries = 0;
 4774         map->size = 0;
 4775         map->ref_count = 1;
 4776         vm_map_setmin(map, vmin);
 4777         vm_map_setmax(map, vmax);
 4778         map->flags = flags;
 4779         map->first_free = &map->header;
 4780         map->hint = &map->header;
 4781         map->timestamp = 0;
 4782         map->busy = NULL;
 4783 
 4784         rw_init(&map->lock);
 4785         cv_init(&map->cv, "vm_map");
 4786         mutex_init(&map->misc_lock, MUTEX_DRIVER, IPL_NONE);
 4787 }
 4788 
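/*
 * Illustrative sketch (assumption): initializing a map object before it
 * is put into service, mirroring what uvmspace_init() does above.  The
 * embedded map, its bounds "vmin"/"vmax", and pmap_kernel() as the
 * backing pmap are assumptions made for this example only.
 */
#if 0
	static struct vm_map examplemap;	/* hypothetical map */

	uvm_map_setup(&examplemap, vmin, vmax, VM_MAP_PAGEABLE);
	examplemap.pmap = pmap_kernel();	/* caller supplies the pmap */
#endif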
 4789 /*
 4790  *   U N M A P   -   m a i n   e n t r y   p o i n t
 4791  */
 4792 
 4793 /*
 4794  * uvm_unmap1: remove mappings from a vm_map (from "start" up to "stop")
 4795  *
 4796  * => caller must check alignment and size
 4797  * => map must be unlocked (we will lock it)
 4798  * => flags is UVM_FLAG_QUANTUM or 0.
 4799  */
 4800 
 4801 void
 4802 uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
 4803 {
 4804         struct vm_map_entry *dead_entries;
 4805         UVMHIST_FUNC(__func__);
 4806         UVMHIST_CALLARGS(maphist, "  (map=%#jx, start=%#jx, end=%#jx)",
 4807             (uintptr_t)map, start, end, 0);
 4808 
 4809         KASSERTMSG(start < end,
 4810             "%s: map %p: start %#jx < end %#jx", __func__, map,
 4811             (uintmax_t)start, (uintmax_t)end);
 4812         if (map == kernel_map) {
 4813                 LOCKDEBUG_MEM_CHECK((void *)start, end - start);
 4814         }
 4815 
 4816         /*
 4817          * work now done by helper functions.   wipe the pmap mappings and
 4818          * then detach from the dead entries...
 4819          */
 4820         vm_map_lock(map);
 4821         uvm_unmap_remove(map, start, end, &dead_entries, flags);
 4822         vm_map_unlock(map);
 4823 
 4824         if (dead_entries != NULL)
 4825                 uvm_unmap_detach(dead_entries, 0);
 4826 
 4827         UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
 4828 }
 4829 
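/*
 * Illustrative sketch (not original code): removing a page-aligned range
 * from a map.  As the comment above notes, the caller is responsible for
 * alignment/size checks and the map must be unlocked on entry;
 * "addr"/"len" are hypothetical.
 */
#if 0
	vaddr_t start = trunc_page(addr);
	vaddr_t end = round_page(addr + len);

	uvm_unmap1(map, start, end, 0);
#endif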
 4830 
 4831 /*
 4832  * uvm_map_reference: add reference to a map
 4833  *
 4834  * => map need not be locked
 4835  */
 4836 
 4837 void
 4838 uvm_map_reference(struct vm_map *map)
 4839 {
 4840 
 4841         atomic_inc_uint(&map->ref_count);
 4842 }
 4843 
 4844 void
 4845 uvm_map_lock_entry(struct vm_map_entry *entry, krw_t op)
 4846 {
 4847 
 4848         if (entry->aref.ar_amap != NULL) {
 4849                 amap_lock(entry->aref.ar_amap, op);
 4850         }
 4851         if (UVM_ET_ISOBJ(entry)) {
 4852                 rw_enter(entry->object.uvm_obj->vmobjlock, op);
 4853         }
 4854 }
 4855 
 4856 void
 4857 uvm_map_unlock_entry(struct vm_map_entry *entry)
 4858 {
 4859 
 4860         if (UVM_ET_ISOBJ(entry)) {
 4861                 rw_exit(entry->object.uvm_obj->vmobjlock);
 4862         }
 4863         if (entry->aref.ar_amap != NULL) {
 4864                 amap_unlock(entry->aref.ar_amap);
 4865         }
 4866 }
 4867 
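/*
 * Illustrative sketch (assumption): the pairing of uvm_map_lock_entry()
 * and uvm_map_unlock_entry() around a pmap operation, mirroring the
 * usage in uvm_mapent_forkcopy() above.
 */
#if 0
	uvm_map_lock_entry(entry, RW_READER);
	pmap_protect(map->pmap, entry->start, entry->end,
	    entry->protection & ~VM_PROT_WRITE);
	uvm_map_unlock_entry(entry);
#endif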
 4868 #define UVM_VOADDR_TYPE_MASK    0x3UL
 4869 #define UVM_VOADDR_TYPE_UOBJ    0x1UL
 4870 #define UVM_VOADDR_TYPE_ANON    0x2UL
 4871 #define UVM_VOADDR_OBJECT_MASK  ~UVM_VOADDR_TYPE_MASK
 4872 
 4873 #define UVM_VOADDR_GET_TYPE(voa)                                        \
 4874         ((voa)->object & UVM_VOADDR_TYPE_MASK)
 4875 #define UVM_VOADDR_GET_OBJECT(voa)                                      \
 4876         ((voa)->object & UVM_VOADDR_OBJECT_MASK)
 4877 #define UVM_VOADDR_SET_OBJECT(voa, obj, type)                           \
 4878 do {                                                                    \
 4879         KASSERT(((uintptr_t)(obj) & UVM_VOADDR_TYPE_MASK) == 0);        \
 4880         (voa)->object = ((uintptr_t)(obj)) | (type);                    \
 4881 } while (/*CONSTCOND*/0)
 4882 
 4883 #define UVM_VOADDR_GET_UOBJ(voa)                                        \
 4884         ((struct uvm_object *)UVM_VOADDR_GET_OBJECT(voa))
 4885 #define UVM_VOADDR_SET_UOBJ(voa, uobj)                                  \
 4886         UVM_VOADDR_SET_OBJECT(voa, uobj, UVM_VOADDR_TYPE_UOBJ)
 4887 
 4888 #define UVM_VOADDR_GET_ANON(voa)                                        \
 4889         ((struct vm_anon *)UVM_VOADDR_GET_OBJECT(voa))
 4890 #define UVM_VOADDR_SET_ANON(voa, anon)                                  \
 4891         UVM_VOADDR_SET_OBJECT(voa, anon, UVM_VOADDR_TYPE_ANON)
 4892 
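/*
 * Illustrative sketch (not original code): the macros above pack a type
 * tag into the low bits of the owner pointer.  A consumer would branch
 * on the tag roughly like this; see uvm_voaddr_release() below for the
 * real pattern.
 */
#if 0
	struct uvm_object *uobj = NULL;
	struct vm_anon *anon = NULL;

	switch (UVM_VOADDR_GET_TYPE(voaddr)) {
	case UVM_VOADDR_TYPE_UOBJ:
		uobj = UVM_VOADDR_GET_UOBJ(voaddr);
		break;
	case UVM_VOADDR_TYPE_ANON:
		anon = UVM_VOADDR_GET_ANON(voaddr);
		break;
	}
#endif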
 4893 /*
 4894  * uvm_voaddr_acquire: returns the virtual object address corresponding
 4895  * to the specified virtual address.
 4896  *
 4897  * => resolves COW so the true page identity is tracked.
 4898  *
 4899  * => acquires a reference on the page's owner (uvm_object or vm_anon)
 4900  */
 4901 bool
 4902 uvm_voaddr_acquire(struct vm_map * const map, vaddr_t const va,
 4903     struct uvm_voaddr * const voaddr)
 4904 {
 4905         struct vm_map_entry *entry;
 4906         struct vm_anon *anon = NULL;
 4907         bool result = false;
 4908         bool exclusive = false;
 4909         void (*unlock_fn)(struct vm_map *);
 4910 
 4911         UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
 4912         UVMHIST_LOG(maphist,"(map=%#jx,va=%#jx)", (uintptr_t)map, va, 0, 0);
 4913 
 4914         const vaddr_t start = trunc_page(va);
 4915         const vaddr_t end = round_page(va+1);
 4916 
 4917  lookup_again:
 4918         if (__predict_false(exclusive)) {
 4919                 vm_map_lock(map);
 4920                 unlock_fn = vm_map_unlock;
 4921         } else {
 4922                 vm_map_lock_read(map);
 4923                 unlock_fn = vm_map_unlock_read;
 4924         }
 4925 
 4926         if (__predict_false(!uvm_map_lookup_entry(map, start, &entry))) {
 4927                 unlock_fn(map);
 4928                 UVMHIST_LOG(maphist,"<- done (no entry)",0,0,0,0);
 4929                 return false;
 4930         }
 4931 
 4932         if (__predict_false(entry->protection == VM_PROT_NONE)) {
 4933                 unlock_fn(map);
 4934                 UVMHIST_LOG(maphist,"<- done (PROT_NONE)",0,0,0,0);
 4935                 return false;
 4936         }
 4937 
 4938         /*
 4939          * We have a fast path for the common case of "no COW resolution
 4940          * needed" whereby we have taken a read lock on the map and if
 4941          * we don't encounter any need to create a vm_anon then great!
 4942          * But if we do, we loop around again, instead taking an exclusive
 4943          * lock so that we can perform the fault.
 4944          *
 4945          * In the event that we have to resolve the fault, we do nearly the
 4946          * same work as uvm_map_pageable() does:
 4947          *
 4948          * 1: holding the write lock, we create any anonymous maps that need
 4949          *    to be created.  however, we do NOT need to clip the map entries
 4950          *    in this case.
 4951          *
 4952          * 2: we downgrade to a read lock, and call uvm_fault_wire to fault
 4953          *    in the page (assuming the entry is not already wired).  this
 4954          *    is done because we need the vm_anon to be present.
 4955          */
 4956         if (__predict_true(!VM_MAPENT_ISWIRED(entry))) {
 4957 
 4958                 bool need_fault = false;
 4959 
 4960                 /*
 4961                  * perform the actions of vm_map_lookup that need the
 4962                  * write lock on the map: create an anonymous map for
 4963                  * a copy-on-write region, or an anonymous map for
 4964                  * a zero-fill region.
 4965                  */
 4966                 if (__predict_false(UVM_ET_ISSUBMAP(entry))) {
 4967                         unlock_fn(map);
 4968                         UVMHIST_LOG(maphist,"<- done (submap)",0,0,0,0);
 4969                         return false;
 4970                 }
 4971                 if (__predict_false(UVM_ET_ISNEEDSCOPY(entry) &&
 4972                     ((entry->max_protection & VM_PROT_WRITE) ||
 4973                      (entry->object.uvm_obj == NULL)))) {
 4974                         if (!exclusive) {
 4975                                 /* need to take the slow path */
 4976                                 KASSERT(unlock_fn == vm_map_unlock_read);
 4977                                 vm_map_unlock_read(map);
 4978                                 exclusive = true;
 4979                                 goto lookup_again;
 4980                         }
 4981                         need_fault = true;
 4982                         amap_copy(map, entry, 0, start, end);
 4983                         /* XXXCDC: wait OK? */
 4984                 }
 4985 
 4986                 /*
 4987                  * do a quick check to see if the fault has already
 4988                  * been resolved to the upper layer.
 4989                  */
 4990                 if (__predict_true(entry->aref.ar_amap != NULL &&
 4991                                    need_fault == false)) {
 4992                         amap_lock(entry->aref.ar_amap, RW_WRITER);
 4993                         anon = amap_lookup(&entry->aref, start - entry->start);
 4994                         if (__predict_true(anon != NULL)) {
 4995                                 /* amap unlocked below */
 4996                                 goto found_anon;
 4997                         }
 4998                         amap_unlock(entry->aref.ar_amap);
 4999                         need_fault = true;
 5000                 }
 5001 
 5002                 /*
 5003                  * we predict this test as false because if we reach
 5004                  * this point, then we are likely dealing with a
 5005                  * shared memory region backed by a uvm_object, in
 5006                  * which case a fault to create the vm_anon is not
 5007                  * necessary.
 5008                  */
 5009                 if (__predict_false(need_fault)) {
 5010                         if (exclusive) {
 5011                                 vm_map_busy(map);
 5012                                 vm_map_unlock(map);
 5013                                 unlock_fn = vm_map_unbusy;
 5014                         }
 5015 
 5016                         if (uvm_fault_wire(map, start, end,
 5017                                            entry->max_protection, 1)) {
 5018                                 /* wiring failed */
 5019                                 unlock_fn(map);
 5020                                 UVMHIST_LOG(maphist,"<- done (wire failed)",
 5021                                             0,0,0,0);
 5022                                 return false;
 5023                         }
 5024 
 5025                         /*
 5026                          * now that we have resolved the fault, we can unwire
 5027                          * the page.
 5028                          */
 5029                         if (exclusive) {
 5030                                 vm_map_lock(map);
 5031                                 vm_map_unbusy(map);
 5032                                 unlock_fn = vm_map_unlock;
 5033                         }
 5034 
 5035                         uvm_fault_unwire_locked(map, start, end);
 5036                 }
 5037         }
 5038 
 5039         /* check the upper layer */
 5040         if (entry->aref.ar_amap) {
 5041                 amap_lock(entry->aref.ar_amap, RW_WRITER);
 5042                 anon = amap_lookup(&entry->aref, start - entry->start);
 5043                 if (anon) {
 5044  found_anon:            KASSERT(anon->an_lock == entry->aref.ar_amap->am_lock);
 5045                         anon->an_ref++;
 5046                         rw_obj_hold(anon->an_lock);
 5047                         KASSERT(anon->an_ref != 0);
 5048                         UVM_VOADDR_SET_ANON(voaddr, anon);
 5049                         voaddr->offset = va & PAGE_MASK;
 5050                         result = true;
 5051                 }
 5052                 amap_unlock(entry->aref.ar_amap);
 5053         }
 5054 
 5055         /* check the lower layer */
 5056         if (!result && UVM_ET_ISOBJ(entry)) {
 5057                 struct uvm_object *uobj = entry->object.uvm_obj;
 5058 
 5059                 KASSERT(uobj != NULL);
 5060                 (*uobj->pgops->pgo_reference)(uobj);
 5061                 UVM_VOADDR_SET_UOBJ(voaddr, uobj);
 5062                 voaddr->offset = entry->offset + (va - entry->start);
 5063                 result = true;
 5064         }
 5065 
 5066         unlock_fn(map);
 5067 
 5068         if (result) {
 5069                 UVMHIST_LOG(maphist,
 5070                     "<- done OK (type=%jd,owner=%#jx,offset=%#jx)",
 5071                     UVM_VOADDR_GET_TYPE(voaddr),
 5072                     UVM_VOADDR_GET_OBJECT(voaddr),
 5073                     voaddr->offset, 0);
 5074         } else {
 5075                 UVMHIST_LOG(maphist,"<- done (failed)",0,0,0,0);
 5076         }
 5077 
 5078         return result;
 5079 }
 5080 
 5081 /*
 5082  * uvm_voaddr_release: release the references held by the
 5083  * virtual object address.
 5084  */
 5085 void
 5086 uvm_voaddr_release(struct uvm_voaddr * const voaddr)
 5087 {
 5088 
 5089         switch (UVM_VOADDR_GET_TYPE(voaddr)) {
 5090         case UVM_VOADDR_TYPE_UOBJ: {
 5091                 struct uvm_object * const uobj = UVM_VOADDR_GET_UOBJ(voaddr);
 5092 
 5093                 KASSERT(uobj != NULL);
 5094                 KASSERT(uobj->pgops->pgo_detach != NULL);
 5095                 (*uobj->pgops->pgo_detach)(uobj);
 5096                 break;
 5097             }
 5098         case UVM_VOADDR_TYPE_ANON: {
 5099                 struct vm_anon * const anon = UVM_VOADDR_GET_ANON(voaddr);
 5100                 krwlock_t *lock;
 5101 
 5102                 KASSERT(anon != NULL);
 5103                 rw_enter((lock = anon->an_lock), RW_WRITER);
 5104                 KASSERT(anon->an_ref > 0);
 5105                 if (--anon->an_ref == 0) {
 5106                         uvm_anfree(anon);
 5107                 }
 5108                 rw_exit(lock);
 5109                 rw_obj_free(lock);
 5110                 break;
 5111             }
 5112         default:
 5113                 panic("uvm_voaddr_release: bad type");
 5114         }
 5115         memset(voaddr, 0, sizeof(*voaddr));
 5116 }
 5117 
 5118 /*
 5119  * uvm_voaddr_compare: compare two uvm_voaddr objects.
 5120  *
 5121  * => memcmp() semantics
 5122  */
 5123 int
 5124 uvm_voaddr_compare(const struct uvm_voaddr * const voaddr1,
 5125     const struct uvm_voaddr * const voaddr2)
 5126 {
 5127         const uintptr_t type1 = UVM_VOADDR_GET_TYPE(voaddr1);
 5128         const uintptr_t type2 = UVM_VOADDR_GET_TYPE(voaddr2);
 5129 
 5130         KASSERT(type1 == UVM_VOADDR_TYPE_UOBJ ||
 5131                 type1 == UVM_VOADDR_TYPE_ANON);
 5132 
 5133         KASSERT(type2 == UVM_VOADDR_TYPE_UOBJ ||
 5134                 type2 == UVM_VOADDR_TYPE_ANON);
 5135 
 5136         if (type1 < type2)
 5137                 return -1;
 5138         if (type1 > type2)
 5139                 return 1;
 5140 
 5141         const uintptr_t addr1 = UVM_VOADDR_GET_OBJECT(voaddr1);
 5142         const uintptr_t addr2 = UVM_VOADDR_GET_OBJECT(voaddr2);
 5143 
 5144         if (addr1 < addr2)
 5145                 return -1;
 5146         if (addr1 > addr2)
 5147                 return 1;
 5148 
 5149         if (voaddr1->offset < voaddr2->offset)
 5150                 return -1;
 5151         if (voaddr1->offset > voaddr2->offset)
 5152                 return 1;
 5153 
 5154         return 0;
 5155 }
 5156 
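/*
 * Illustrative sketch (assumption): deciding whether two virtual
 * addresses resolve to the same backing object and offset, using the
 * acquire/compare/release trio above.  Error handling is minimal and
 * "map", "addr1", "addr2" are hypothetical.
 */
#if 0
	struct uvm_voaddr va1, va2;

	if (uvm_voaddr_acquire(map, addr1, &va1)) {
		if (uvm_voaddr_acquire(map, addr2, &va2)) {
			if (uvm_voaddr_compare(&va1, &va2) == 0) {
				/* both VAs name the same page owner+offset */
			}
			uvm_voaddr_release(&va2);
		}
		uvm_voaddr_release(&va1);
	}
#endif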
 5157 #if defined(DDB) || defined(DEBUGPRINT)
 5158 
 5159 /*
 5160  * uvm_map_printit: actually prints the map
 5161  */
 5162 
 5163 void
 5164 uvm_map_printit(struct vm_map *map, bool full,
 5165     void (*pr)(const char *, ...))
 5166 {
 5167         struct vm_map_entry *entry;
 5168 
 5169         (*pr)("MAP %p: [%#lx->%#lx]\n", map, vm_map_min(map),
 5170             vm_map_max(map));
 5171         (*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=%#x\n",
 5172             map->nentries, map->size, map->ref_count, map->timestamp,
 5173             map->flags);
 5174         (*pr)("\tpmap=%p(resident=%ld, wired=%ld)\n", map->pmap,
 5175             pmap_resident_count(map->pmap), pmap_wired_count(map->pmap));
 5176         if (!full)
 5177                 return;
 5178         for (entry = map->header.next; entry != &map->header;
 5179             entry = entry->next) {
 5180                 (*pr)(" - %p: %#lx->%#lx: obj=%p/%#llx, amap=%p/%d\n",
 5181                     entry, entry->start, entry->end, entry->object.uvm_obj,
 5182                     (long long)entry->offset, entry->aref.ar_amap,
 5183                     entry->aref.ar_pageoff);
 5184                 (*pr)(
 5185                     "\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
 5186                     "wc=%d, adv=%d%s\n",
 5187                     (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
 5188                     (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
 5189                     (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
 5190                     entry->protection, entry->max_protection,
 5191                     entry->inheritance, entry->wired_count, entry->advice,
 5192                     entry == map->first_free ? " (first_free)" : "");
 5193         }
 5194 }
 5195 
 5196 void
 5197 uvm_whatis(uintptr_t addr, void (*pr)(const char *, ...))
 5198 {
 5199         struct vm_map *map;
 5200 
 5201         for (map = kernel_map;;) {
 5202                 struct vm_map_entry *entry;
 5203 
 5204                 if (!uvm_map_lookup_entry_bytree(map, (vaddr_t)addr, &entry)) {
 5205                         break;
 5206                 }
 5207                 (*pr)("%p is %p+%zu from VMMAP %p\n",
 5208                     (void *)addr, (void *)entry->start,
 5209                     (size_t)(addr - (uintptr_t)entry->start), map);
 5210                 if (!UVM_ET_ISSUBMAP(entry)) {
 5211                         break;
 5212                 }
 5213                 map = entry->object.sub_map;
 5214         }
 5215 }
 5216 
 5217 #endif /* DDB || DEBUGPRINT */
 5218 
 5219 #ifndef __USER_VA0_IS_SAFE
 5220 static int
 5221 sysctl_user_va0_disable(SYSCTLFN_ARGS)
 5222 {
 5223         struct sysctlnode node;
 5224         int t, error;
 5225 
 5226         node = *rnode;
 5227         node.sysctl_data = &t;
 5228         t = user_va0_disable;
 5229         error = sysctl_lookup(SYSCTLFN_CALL(&node));
 5230         if (error || newp == NULL)
 5231                 return (error);
 5232 
 5233         if (!t && user_va0_disable &&
 5234             kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MAP_VA_ZERO, 0,
 5235             NULL, NULL, NULL))
 5236                 return EPERM;
 5237 
 5238         user_va0_disable = !!t;
 5239         return 0;
 5240 }
 5241 #endif
 5242 
 5243 static int
 5244 fill_vmentry(struct lwp *l, struct proc *p, struct kinfo_vmentry *kve,
 5245     struct vm_map *m, struct vm_map_entry *e)
 5246 {
 5247 #ifndef _RUMPKERNEL
 5248         int error;
 5249 
 5250         memset(kve, 0, sizeof(*kve));
 5251         KASSERT(e != NULL);
 5252         if (UVM_ET_ISOBJ(e)) {
 5253                 struct uvm_object *uobj = e->object.uvm_obj;
 5254                 KASSERT(uobj != NULL);
 5255                 kve->kve_ref_count = uobj->uo_refs;
 5256                 kve->kve_count = uobj->uo_npages;
 5257                 if (UVM_OBJ_IS_VNODE(uobj)) {
 5258                         struct vattr va;
 5259                         struct vnode *vp = (struct vnode *)uobj;
 5260                         vn_lock(vp, LK_SHARED | LK_RETRY);
 5261                         error = VOP_GETATTR(vp, &va, l->l_cred);
 5262                         VOP_UNLOCK(vp);
 5263                         kve->kve_type = KVME_TYPE_VNODE;
 5264                         if (error == 0) {
 5265                                 kve->kve_vn_size = vp->v_size;
 5266                                 kve->kve_vn_type = (int)vp->v_type;
 5267                                 kve->kve_vn_mode = va.va_mode;
 5268                                 kve->kve_vn_rdev = va.va_rdev;
 5269                                 kve->kve_vn_fileid = va.va_fileid;
 5270                                 kve->kve_vn_fsid = va.va_fsid;
 5271                                 error = vnode_to_path(kve->kve_path,
 5272                                     sizeof(kve->kve_path) / 2, vp, l, p);
 5273                         }
 5274                 } else if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
 5275                         kve->kve_type = KVME_TYPE_KERN;
 5276                 } else if (UVM_OBJ_IS_DEVICE(uobj)) {
 5277                         kve->kve_type = KVME_TYPE_DEVICE;
 5278                 } else if (UVM_OBJ_IS_AOBJ(uobj)) {
 5279                         kve->kve_type = KVME_TYPE_ANON;
 5280                 } else {
 5281                         kve->kve_type = KVME_TYPE_OBJECT;
 5282                 }
 5283         } else if (UVM_ET_ISSUBMAP(e)) {
 5284                 struct vm_map *map = e->object.sub_map;
 5285                 KASSERT(map != NULL);
 5286                 kve->kve_ref_count = map->ref_count;
 5287                 kve->kve_count = map->nentries;
 5288                 kve->kve_type = KVME_TYPE_SUBMAP;
 5289         } else
 5290                 kve->kve_type = KVME_TYPE_UNKNOWN;
 5291 
 5292         kve->kve_start = e->start;
 5293         kve->kve_end = e->end;
 5294         kve->kve_offset = e->offset;
 5295         kve->kve_wired_count = e->wired_count;
 5296         kve->kve_inheritance = e->inheritance;
 5297         kve->kve_attributes = 0; /* unused */
 5298         kve->kve_advice = e->advice;
 5299 #define PROT(p) (((p) & VM_PROT_READ) ? KVME_PROT_READ : 0) | \
 5300         (((p) & VM_PROT_WRITE) ? KVME_PROT_WRITE : 0) | \
 5301         (((p) & VM_PROT_EXECUTE) ? KVME_PROT_EXEC : 0)
 5302         kve->kve_protection = PROT(e->protection);
 5303         kve->kve_max_protection = PROT(e->max_protection);
 5304         kve->kve_flags |= (e->etype & UVM_ET_COPYONWRITE)
 5305             ? KVME_FLAG_COW : 0;
 5306         kve->kve_flags |= (e->etype & UVM_ET_NEEDSCOPY)
 5307             ? KVME_FLAG_NEEDS_COPY : 0;
 5308         kve->kve_flags |= (m->flags & VM_MAP_TOPDOWN)
 5309             ? KVME_FLAG_GROWS_DOWN : KVME_FLAG_GROWS_UP;
 5310         kve->kve_flags |= (m->flags & VM_MAP_PAGEABLE)
 5311             ? KVME_FLAG_PAGEABLE : 0;
 5312 #endif
 5313         return 0;
 5314 }
 5315 
 5316 static int
 5317 fill_vmentries(struct lwp *l, pid_t pid, u_int elem_size, void *oldp,
 5318     size_t *oldlenp)
 5319 {
 5320         int error;
 5321         struct proc *p;
 5322         struct kinfo_vmentry *vme;
 5323         struct vmspace *vm;
 5324         struct vm_map *map;
 5325         struct vm_map_entry *entry;
 5326         char *dp;
 5327         size_t count, vmesize;
 5328 
 5329         if (elem_size == 0 || elem_size > 2 * sizeof(*vme))
 5330                 return EINVAL;
 5331 
 5332         if (oldp) {
 5333                 if (*oldlenp > 10UL * 1024UL * 1024UL)
 5334                         return E2BIG;
 5335                 count = *oldlenp / elem_size;
 5336                 if (count == 0)
 5337                         return ENOMEM;
 5338                 vmesize = count * sizeof(*vme);
 5339         } else
 5340                 vmesize = 0;
 5341 
 5342         if ((error = proc_find_locked(l, &p, pid)) != 0)
 5343                 return error;
 5344 
 5345         vme = NULL;
 5346         count = 0;
 5347 
 5348         if ((error = proc_vmspace_getref(p, &vm)) != 0)
 5349                 goto out;
 5350 
 5351         map = &vm->vm_map;
 5352         vm_map_lock_read(map);
 5353 
 5354         dp = oldp;
 5355         if (oldp)
 5356                 vme = kmem_alloc(vmesize, KM_SLEEP);
 5357         for (entry = map->header.next; entry != &map->header;
 5358             entry = entry->next) {
 5359                 if (oldp && (dp - (char *)oldp) < vmesize) {
 5360                         error = fill_vmentry(l, p, &vme[count], map, entry);
 5361                         if (error)
 5362                                 goto out;
 5363                         dp += elem_size;
 5364                 }
 5365                 count++;
 5366         }
 5367         vm_map_unlock_read(map);
 5368         uvmspace_free(vm);
 5369 
 5370 out:
 5371         if (pid != -1)
 5372                 mutex_exit(p->p_lock);
 5373         if (error == 0) {
 5374                 const u_int esize = uimin(sizeof(*vme), elem_size);
 5375                 dp = oldp;
 5376                 for (size_t i = 0; i < count; i++) {
 5377                         if (oldp && (dp - (char *)oldp) < vmesize) {
 5378                                 error = sysctl_copyout(l, &vme[i], dp, esize);
 5379                                 if (error)
 5380                                         break;
 5381                                 dp += elem_size;
 5382                         } else
 5383                                 break;
 5384                 }
 5385                 count *= elem_size;
 5386                 if (oldp != NULL && *oldlenp < count)
 5387                         error = ENOSPC;
 5388                 *oldlenp = count;
 5389         }
 5390         if (vme)
 5391                 kmem_free(vme, vmesize);
 5392         return error;
 5393 }
 5394 
 5395 static int
 5396 sysctl_vmproc(SYSCTLFN_ARGS)
 5397 {
 5398         int error;
 5399 
 5400         if (namelen == 1 && name[0] == CTL_QUERY)
 5401                 return (sysctl_query(SYSCTLFN_CALL(rnode)));
 5402 
 5403         if (namelen == 0)
 5404                 return EINVAL;
 5405 
 5406         switch (name[0]) {
 5407         case VM_PROC_MAP:
 5408                 if (namelen != 3)
 5409                         return EINVAL;
 5410                 sysctl_unlock();
 5411                 error = fill_vmentries(l, name[1], name[2], oldp, oldlenp);
 5412                 sysctl_relock();
 5413                 return error;
 5414         default:
 5415                 return EINVAL;
 5416         }
 5417 }
 5418 
 5419 SYSCTL_SETUP(sysctl_uvmmap_setup, "sysctl uvmmap setup")
 5420 {
 5421 
 5422         sysctl_createv(clog, 0, NULL, NULL,
 5423                        CTLFLAG_PERMANENT,
 5424                        CTLTYPE_STRUCT, "proc",
 5425                        SYSCTL_DESCR("Process vm information"),
 5426                        sysctl_vmproc, 0, NULL, 0,
 5427                        CTL_VM, VM_PROC, CTL_EOL);
 5428 #ifndef __USER_VA0_IS_SAFE
 5429         sysctl_createv(clog, 0, NULL, NULL,
 5430                        CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
 5431                        CTLTYPE_INT, "user_va0_disable",
 5432                        SYSCTL_DESCR("Disable VA 0"),
 5433                        sysctl_user_va0_disable, 0, &user_va0_disable, 0,
 5434                        CTL_VM, CTL_CREATE, CTL_EOL);
 5435 #endif
 5436 }
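/*
 * Illustrative sketch (assumption): a userland query of the vm.proc.map
 * node created above.  The mib layout {CTL_VM, VM_PROC, VM_PROC_MAP,
 * pid, elem_size} is inferred from sysctl_vmproc()/fill_vmentries();
 * error handling and the second (fetch) pass are elided.
 */
#if 0
	/* userland, not kernel code */
	int mib[5] = { CTL_VM, VM_PROC, VM_PROC_MAP, (int)getpid(),
	    sizeof(struct kinfo_vmentry) };
	size_t len = 0;

	if (sysctl(mib, 5, NULL, &len, NULL, 0) == 0) {
		/* allocate len bytes, then call sysctl() again to fetch */
	}
#endif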
