FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_map.c
1 /*-
2 * SPDX-License-Identifier: (BSD-3-Clause AND MIT-CMU)
3 *
4 * Copyright (c) 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * The Mach Operating System project at Carnegie-Mellon University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94
35 *
36 *
37 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
38 * All rights reserved.
39 *
40 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
41 *
42 * Permission to use, copy, modify and distribute this software and
43 * its documentation is hereby granted, provided that both the copyright
44 * notice and this permission notice appear in all copies of the
45 * software, derivative works or modified versions, and any portions
46 * thereof, and that both notices appear in supporting documentation.
47 *
48 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
49 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
50 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
51 *
52 * Carnegie Mellon requests users of this software to return to
53 *
54 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
55 * School of Computer Science
56 * Carnegie Mellon University
57 * Pittsburgh PA 15213-3890
58 *
59 * any improvements or extensions that they make and grant Carnegie the
60 * rights to redistribute these changes.
61 */
62
63 /*
64 * Virtual memory mapping module.
65 */
66
67 #include <sys/cdefs.h>
68 __FBSDID("$FreeBSD$");
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/elf.h>
73 #include <sys/kernel.h>
74 #include <sys/ktr.h>
75 #include <sys/lock.h>
76 #include <sys/mutex.h>
77 #include <sys/proc.h>
78 #include <sys/vmmeter.h>
79 #include <sys/mman.h>
80 #include <sys/vnode.h>
81 #include <sys/racct.h>
82 #include <sys/resourcevar.h>
83 #include <sys/rwlock.h>
84 #include <sys/file.h>
85 #include <sys/sysctl.h>
86 #include <sys/sysent.h>
87 #include <sys/shm.h>
88
89 #include <vm/vm.h>
90 #include <vm/vm_param.h>
91 #include <vm/pmap.h>
92 #include <vm/vm_map.h>
93 #include <vm/vm_page.h>
94 #include <vm/vm_pageout.h>
95 #include <vm/vm_object.h>
96 #include <vm/vm_pager.h>
97 #include <vm/vm_kern.h>
98 #include <vm/vm_extern.h>
99 #include <vm/vnode_pager.h>
100 #include <vm/swap_pager.h>
101 #include <vm/uma.h>
102
103 /*
104 * Virtual memory maps provide for the mapping, protection,
105 * and sharing of virtual memory objects. In addition,
106 * this module provides for an efficient virtual copy of
107 * memory from one map to another.
108 *
109 * Synchronization is required prior to most operations.
110 *
111 * Maps consist of an ordered doubly-linked list of simple
112 * entries; a self-adjusting binary search tree of these
113 * entries is used to speed up lookups.
114 *
115 * Since portions of maps are specified by start/end addresses,
116 * which may not align with existing map entries, all
117 * routines merely "clip" entries to these start/end values.
118 * [That is, an entry is split into two, bordering at a
119 * start or end value.] Note that these clippings may not
120 * always be necessary (as the two resulting entries are then
121 * not changed); however, the clipping is done for convenience.
122 *
123 * As mentioned above, virtual copy operations are performed
124 * by copying VM object references from one map to
125 * another, and then marking both regions as copy-on-write.
126 */
127
128 static struct mtx map_sleep_mtx;
129 static uma_zone_t mapentzone;
130 static uma_zone_t kmapentzone;
131 static uma_zone_t vmspace_zone;
132 static int vmspace_zinit(void *mem, int size, int flags);
133 static void _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min,
134 vm_offset_t max);
135 static void vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map);
136 static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry);
137 static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry);
138 static int vm_map_growstack(vm_map_t map, vm_offset_t addr,
139 vm_map_entry_t gap_entry);
140 static void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
141 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags);
142 #ifdef INVARIANTS
143 static void vmspace_zdtor(void *mem, int size, void *arg);
144 #endif
145 static int vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos,
146 vm_size_t max_ssize, vm_size_t growsize, vm_prot_t prot, vm_prot_t max,
147 int cow);
148 static void vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
149 vm_offset_t failed_addr);
150
151 #define ENTRY_CHARGED(e) ((e)->cred != NULL || \
152 ((e)->object.vm_object != NULL && (e)->object.vm_object->cred != NULL && \
153 !((e)->eflags & MAP_ENTRY_NEEDS_COPY)))
154
155 /*
156 * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
157 * stable.
158 */
159 #define PROC_VMSPACE_LOCK(p) do { } while (0)
160 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
161
162 /*
163 * VM_MAP_RANGE_CHECK: [ internal use only ]
164 *
165 * Asserts that the starting and ending region
166 * addresses fall within the valid range of the map.
167 */
168 #define VM_MAP_RANGE_CHECK(map, start, end) \
169 { \
170 if (start < vm_map_min(map)) \
171 start = vm_map_min(map); \
172 if (end > vm_map_max(map)) \
173 end = vm_map_max(map); \
174 if (start > end) \
175 start = end; \
176 }
177
178 #ifndef UMA_MD_SMALL_ALLOC
179
180 /*
181 * Allocate a new slab for kernel map entries. The kernel map may be locked or
182 * unlocked, depending on whether the request is coming from the kernel map or a
183 * submap. This function allocates a virtual address range directly from the
184 * kernel map instead of the kmem_* layer to avoid recursion on the kernel map
185 * lock and also to avoid triggering allocator recursion in the vmem boundary
186 * tag allocator.
187 */
188 static void *
189 kmapent_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
190 int wait)
191 {
192 vm_offset_t addr;
193 int error, locked;
194
195 *pflag = UMA_SLAB_PRIV;
196
197 if (!(locked = vm_map_locked(kernel_map)))
198 vm_map_lock(kernel_map);
199 addr = vm_map_findspace(kernel_map, vm_map_min(kernel_map), bytes);
200 if (addr + bytes < addr || addr + bytes > vm_map_max(kernel_map))
201 panic("%s: kernel map is exhausted", __func__);
202 error = vm_map_insert(kernel_map, NULL, 0, addr, addr + bytes,
203 VM_PROT_RW, VM_PROT_RW, MAP_NOFAULT);
204 if (error != KERN_SUCCESS)
205 panic("%s: vm_map_insert() failed: %d", __func__, error);
206 if (!locked)
207 vm_map_unlock(kernel_map);
208 error = kmem_back_domain(domain, kernel_object, addr, bytes, M_NOWAIT |
209 M_USE_RESERVE | (wait & M_ZERO));
210 if (error == KERN_SUCCESS) {
211 return ((void *)addr);
212 } else {
213 if (!locked)
214 vm_map_lock(kernel_map);
215 vm_map_delete(kernel_map, addr, bytes);
216 if (!locked)
217 vm_map_unlock(kernel_map);
218 return (NULL);
219 }
220 }
221
222 static void
223 kmapent_free(void *item, vm_size_t size, uint8_t pflag)
224 {
225 vm_offset_t addr;
226 int error;
227
228 if ((pflag & UMA_SLAB_PRIV) == 0)
229 /* XXX leaked */
230 return;
231
232 addr = (vm_offset_t)item;
233 kmem_unback(kernel_object, addr, size);
234 error = vm_map_remove(kernel_map, addr, addr + size);
235 KASSERT(error == KERN_SUCCESS,
236 ("%s: vm_map_remove failed: %d", __func__, error));
237 }
238
239 /*
240 * The worst-case upper bound on the number of kernel map entries that may be
241 * created before the zone must be replenished in _vm_map_unlock().
242 */
243 #define KMAPENT_RESERVE 1
244
245 #endif /* !UMD_MD_SMALL_ALLOC */
246
247 /*
248 * vm_map_startup:
249 *
250 * Initialize the vm_map module. Must be called before any other vm_map
251 * routines.
252 *
253 * User map and entry structures are allocated from the general purpose
254 * memory pool. Kernel maps are statically defined. Kernel map entries
255 * require special handling to avoid recursion; see the comments above
256 * kmapent_alloc() and in vm_map_entry_create().
257 */
258 void
259 vm_map_startup(void)
260 {
261 mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
262
263 /*
264 * Disable the use of per-CPU buckets: map entry allocation is
265 * serialized by the kernel map lock.
266 */
267 kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
268 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
269 UMA_ZONE_VM | UMA_ZONE_NOBUCKET);
270 #ifndef UMA_MD_SMALL_ALLOC
271 /* Reserve an extra map entry for use when replenishing the reserve. */
272 uma_zone_reserve(kmapentzone, KMAPENT_RESERVE + 1);
273 uma_prealloc(kmapentzone, KMAPENT_RESERVE + 1);
274 uma_zone_set_allocf(kmapentzone, kmapent_alloc);
275 uma_zone_set_freef(kmapentzone, kmapent_free);
276 #endif
277
278 mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
279 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
280 vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
281 #ifdef INVARIANTS
282 vmspace_zdtor,
283 #else
284 NULL,
285 #endif
286 vmspace_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
287 }
288
289 static int
290 vmspace_zinit(void *mem, int size, int flags)
291 {
292 struct vmspace *vm;
293 vm_map_t map;
294
295 vm = (struct vmspace *)mem;
296 map = &vm->vm_map;
297
298 memset(map, 0, sizeof(*map));
299 mtx_init(&map->system_mtx, "vm map (system)", NULL,
300 MTX_DEF | MTX_DUPOK);
301 sx_init(&map->lock, "vm map (user)");
302 PMAP_LOCK_INIT(vmspace_pmap(vm));
303 return (0);
304 }
305
306 #ifdef INVARIANTS
307 static void
308 vmspace_zdtor(void *mem, int size, void *arg)
309 {
310 struct vmspace *vm;
311
312 vm = (struct vmspace *)mem;
313 KASSERT(vm->vm_map.nentries == 0,
314 ("vmspace %p nentries == %d on free", vm, vm->vm_map.nentries));
315 KASSERT(vm->vm_map.size == 0,
316 ("vmspace %p size == %ju on free", vm, (uintmax_t)vm->vm_map.size));
317 }
318 #endif /* INVARIANTS */
319
320 /*
321 * Allocate a vmspace structure, including a vm_map and pmap,
322 * and initialize those structures. The refcnt is set to 1.
323 */
324 struct vmspace *
325 vmspace_alloc(vm_offset_t min, vm_offset_t max, pmap_pinit_t pinit)
326 {
327 struct vmspace *vm;
328
329 vm = uma_zalloc(vmspace_zone, M_WAITOK);
330 KASSERT(vm->vm_map.pmap == NULL, ("vm_map.pmap must be NULL"));
331 if (!pinit(vmspace_pmap(vm))) {
332 uma_zfree(vmspace_zone, vm);
333 return (NULL);
334 }
335 CTR1(KTR_VM, "vmspace_alloc: %p", vm);
336 _vm_map_init(&vm->vm_map, vmspace_pmap(vm), min, max);
337 refcount_init(&vm->vm_refcnt, 1);
338 vm->vm_shm = NULL;
339 vm->vm_swrss = 0;
340 vm->vm_tsize = 0;
341 vm->vm_dsize = 0;
342 vm->vm_ssize = 0;
343 vm->vm_taddr = 0;
344 vm->vm_daddr = 0;
345 vm->vm_maxsaddr = 0;
346 return (vm);
347 }
348
349 #ifdef RACCT
350 static void
351 vmspace_container_reset(struct proc *p)
352 {
353
354 PROC_LOCK(p);
355 racct_set(p, RACCT_DATA, 0);
356 racct_set(p, RACCT_STACK, 0);
357 racct_set(p, RACCT_RSS, 0);
358 racct_set(p, RACCT_MEMLOCK, 0);
359 racct_set(p, RACCT_VMEM, 0);
360 PROC_UNLOCK(p);
361 }
362 #endif
363
364 static inline void
365 vmspace_dofree(struct vmspace *vm)
366 {
367
368 CTR1(KTR_VM, "vmspace_free: %p", vm);
369
370 /*
371 * Make sure any SysV shm is freed, it might not have been in
372 * exit1().
373 */
374 shmexit(vm);
375
376 /*
377 * Lock the map, to wait out all other references to it.
378 * Delete all of the mappings and pages they hold, then call
379 * the pmap module to reclaim anything left.
380 */
381 (void)vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
382 vm_map_max(&vm->vm_map));
383
384 pmap_release(vmspace_pmap(vm));
385 vm->vm_map.pmap = NULL;
386 uma_zfree(vmspace_zone, vm);
387 }
388
389 void
390 vmspace_free(struct vmspace *vm)
391 {
392
393 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
394 "vmspace_free() called");
395
396 if (refcount_release(&vm->vm_refcnt))
397 vmspace_dofree(vm);
398 }
399
400 void
401 vmspace_exitfree(struct proc *p)
402 {
403 struct vmspace *vm;
404
405 PROC_VMSPACE_LOCK(p);
406 vm = p->p_vmspace;
407 p->p_vmspace = NULL;
408 PROC_VMSPACE_UNLOCK(p);
409 KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
410 vmspace_free(vm);
411 }
412
413 void
414 vmspace_exit(struct thread *td)
415 {
416 struct vmspace *vm;
417 struct proc *p;
418 bool released;
419
420 p = td->td_proc;
421 vm = p->p_vmspace;
422
423 /*
424 * Prepare to release the vmspace reference. The thread that releases
425 * the last reference is responsible for tearing down the vmspace.
426 * However, threads not releasing the final reference must switch to the
427 * kernel's vmspace0 before the decrement so that the subsequent pmap
428 * deactivation does not modify a freed vmspace.
429 */
430 refcount_acquire(&vmspace0.vm_refcnt);
431 if (!(released = refcount_release_if_last(&vm->vm_refcnt))) {
432 if (p->p_vmspace != &vmspace0) {
433 PROC_VMSPACE_LOCK(p);
434 p->p_vmspace = &vmspace0;
435 PROC_VMSPACE_UNLOCK(p);
436 pmap_activate(td);
437 }
438 released = refcount_release(&vm->vm_refcnt);
439 }
440 if (released) {
441 /*
442 * pmap_remove_pages() expects the pmap to be active, so switch
443 * back first if necessary.
444 */
445 if (p->p_vmspace != vm) {
446 PROC_VMSPACE_LOCK(p);
447 p->p_vmspace = vm;
448 PROC_VMSPACE_UNLOCK(p);
449 pmap_activate(td);
450 }
451 pmap_remove_pages(vmspace_pmap(vm));
452 PROC_VMSPACE_LOCK(p);
453 p->p_vmspace = &vmspace0;
454 PROC_VMSPACE_UNLOCK(p);
455 pmap_activate(td);
456 vmspace_dofree(vm);
457 }
458 #ifdef RACCT
459 if (racct_enable)
460 vmspace_container_reset(p);
461 #endif
462 }
463
464 /* Acquire reference to vmspace owned by another process. */
465
466 struct vmspace *
467 vmspace_acquire_ref(struct proc *p)
468 {
469 struct vmspace *vm;
470
471 PROC_VMSPACE_LOCK(p);
472 vm = p->p_vmspace;
473 if (vm == NULL || !refcount_acquire_if_not_zero(&vm->vm_refcnt)) {
474 PROC_VMSPACE_UNLOCK(p);
475 return (NULL);
476 }
477 if (vm != p->p_vmspace) {
478 PROC_VMSPACE_UNLOCK(p);
479 vmspace_free(vm);
480 return (NULL);
481 }
482 PROC_VMSPACE_UNLOCK(p);
483 return (vm);
484 }
485
486 /*
487 * Switch between vmspaces in an AIO kernel process.
488 *
489 * The new vmspace is either the vmspace of a user process obtained
490 * from an active AIO request or the initial vmspace of the AIO kernel
491 * process (when it is idling). Because user processes will block to
492 * drain any active AIO requests before proceeding in exit() or
493 * execve(), the reference count for vmspaces from AIO requests can
494 * never be 0. Similarly, AIO kernel processes hold an extra
495 * reference on their initial vmspace for the life of the process. As
496 * a result, the 'newvm' vmspace always has a non-zero reference
497 * count. This permits an additional reference on 'newvm' to be
498 * acquired via a simple atomic increment rather than the loop in
499 * vmspace_acquire_ref() above.
500 */
501 void
502 vmspace_switch_aio(struct vmspace *newvm)
503 {
504 struct vmspace *oldvm;
505
506 /* XXX: Need some way to assert that this is an aio daemon. */
507
508 KASSERT(refcount_load(&newvm->vm_refcnt) > 0,
509 ("vmspace_switch_aio: newvm unreferenced"));
510
511 oldvm = curproc->p_vmspace;
512 if (oldvm == newvm)
513 return;
514
515 /*
516 * Point to the new address space and refer to it.
517 */
518 curproc->p_vmspace = newvm;
519 refcount_acquire(&newvm->vm_refcnt);
520
521 /* Activate the new mapping. */
522 pmap_activate(curthread);
523
524 vmspace_free(oldvm);
525 }
526
527 void
528 _vm_map_lock(vm_map_t map, const char *file, int line)
529 {
530
531 if (map->system_map)
532 mtx_lock_flags_(&map->system_mtx, 0, file, line);
533 else
534 sx_xlock_(&map->lock, file, line);
535 map->timestamp++;
536 }
537
538 void
539 vm_map_entry_set_vnode_text(vm_map_entry_t entry, bool add)
540 {
541 vm_object_t object;
542 struct vnode *vp;
543 bool vp_held;
544
545 if ((entry->eflags & MAP_ENTRY_VN_EXEC) == 0)
546 return;
547 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
548 ("Submap with execs"));
549 object = entry->object.vm_object;
550 KASSERT(object != NULL, ("No object for text, entry %p", entry));
551 if ((object->flags & OBJ_ANON) != 0)
552 object = object->handle;
553 else
554 KASSERT(object->backing_object == NULL,
555 ("non-anon object %p shadows", object));
556 KASSERT(object != NULL, ("No content object for text, entry %p obj %p",
557 entry, entry->object.vm_object));
558
559 /*
560 * Mostly, we do not lock the backing object. It is
561 * referenced by the entry we are processing, so it cannot go
562 * away.
563 */
564 vp = NULL;
565 vp_held = false;
566 if (object->type == OBJT_DEAD) {
567 /*
568 * For OBJT_DEAD objects, v_writecount was handled in
569 * vnode_pager_dealloc().
570 */
571 } else if (object->type == OBJT_VNODE) {
572 vp = object->handle;
573 } else if (object->type == OBJT_SWAP) {
574 KASSERT((object->flags & OBJ_TMPFS_NODE) != 0,
575 ("vm_map_entry_set_vnode_text: swap and !TMPFS "
576 "entry %p, object %p, add %d", entry, object, add));
577 /*
578 * Tmpfs VREG node, which was reclaimed, has
579 * OBJ_TMPFS_NODE flag set, but not OBJ_TMPFS. In
580 * this case there is no v_writecount to adjust.
581 */
582 VM_OBJECT_RLOCK(object);
583 if ((object->flags & OBJ_TMPFS) != 0) {
584 vp = object->un_pager.swp.swp_tmpfs;
585 if (vp != NULL) {
586 vhold(vp);
587 vp_held = true;
588 }
589 }
590 VM_OBJECT_RUNLOCK(object);
591 } else {
592 KASSERT(0,
593 ("vm_map_entry_set_vnode_text: wrong object type, "
594 "entry %p, object %p, add %d", entry, object, add));
595 }
596 if (vp != NULL) {
597 if (add) {
598 VOP_SET_TEXT_CHECKED(vp);
599 } else {
600 vn_lock(vp, LK_SHARED | LK_RETRY);
601 VOP_UNSET_TEXT_CHECKED(vp);
602 VOP_UNLOCK(vp);
603 }
604 if (vp_held)
605 vdrop(vp);
606 }
607 }
608
609 /*
610 * Use a different name for this vm_map_entry field when it's use
611 * is not consistent with its use as part of an ordered search tree.
612 */
613 #define defer_next right
614
615 static void
616 vm_map_process_deferred(void)
617 {
618 struct thread *td;
619 vm_map_entry_t entry, next;
620 vm_object_t object;
621
622 td = curthread;
623 entry = td->td_map_def_user;
624 td->td_map_def_user = NULL;
625 while (entry != NULL) {
626 next = entry->defer_next;
627 MPASS((entry->eflags & (MAP_ENTRY_WRITECNT |
628 MAP_ENTRY_VN_EXEC)) != (MAP_ENTRY_WRITECNT |
629 MAP_ENTRY_VN_EXEC));
630 if ((entry->eflags & MAP_ENTRY_WRITECNT) != 0) {
631 /*
632 * Decrement the object's writemappings and
633 * possibly the vnode's v_writecount.
634 */
635 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
636 ("Submap with writecount"));
637 object = entry->object.vm_object;
638 KASSERT(object != NULL, ("No object for writecount"));
639 vm_pager_release_writecount(object, entry->start,
640 entry->end);
641 }
642 vm_map_entry_set_vnode_text(entry, false);
643 vm_map_entry_deallocate(entry, FALSE);
644 entry = next;
645 }
646 }
647
648 #ifdef INVARIANTS
649 static void
650 _vm_map_assert_locked(vm_map_t map, const char *file, int line)
651 {
652
653 if (map->system_map)
654 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
655 else
656 sx_assert_(&map->lock, SA_XLOCKED, file, line);
657 }
658
659 #define VM_MAP_ASSERT_LOCKED(map) \
660 _vm_map_assert_locked(map, LOCK_FILE, LOCK_LINE)
661
662 enum { VMMAP_CHECK_NONE, VMMAP_CHECK_UNLOCK, VMMAP_CHECK_ALL };
663 #ifdef DIAGNOSTIC
664 static int enable_vmmap_check = VMMAP_CHECK_UNLOCK;
665 #else
666 static int enable_vmmap_check = VMMAP_CHECK_NONE;
667 #endif
668 SYSCTL_INT(_debug, OID_AUTO, vmmap_check, CTLFLAG_RWTUN,
669 &enable_vmmap_check, 0, "Enable vm map consistency checking");
670
671 static void _vm_map_assert_consistent(vm_map_t map, int check);
672
673 #define VM_MAP_ASSERT_CONSISTENT(map) \
674 _vm_map_assert_consistent(map, VMMAP_CHECK_ALL)
675 #ifdef DIAGNOSTIC
676 #define VM_MAP_UNLOCK_CONSISTENT(map) do { \
677 if (map->nupdates > map->nentries) { \
678 _vm_map_assert_consistent(map, VMMAP_CHECK_UNLOCK); \
679 map->nupdates = 0; \
680 } \
681 } while (0)
682 #else
683 #define VM_MAP_UNLOCK_CONSISTENT(map)
684 #endif
685 #else
686 #define VM_MAP_ASSERT_LOCKED(map)
687 #define VM_MAP_ASSERT_CONSISTENT(map)
688 #define VM_MAP_UNLOCK_CONSISTENT(map)
689 #endif /* INVARIANTS */
690
691 void
692 _vm_map_unlock(vm_map_t map, const char *file, int line)
693 {
694
695 VM_MAP_UNLOCK_CONSISTENT(map);
696 if (map->system_map) {
697 #ifndef UMA_MD_SMALL_ALLOC
698 if (map == kernel_map && (map->flags & MAP_REPLENISH) != 0) {
699 uma_prealloc(kmapentzone, 1);
700 map->flags &= ~MAP_REPLENISH;
701 }
702 #endif
703 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
704 } else {
705 sx_xunlock_(&map->lock, file, line);
706 vm_map_process_deferred();
707 }
708 }
709
710 void
711 _vm_map_lock_read(vm_map_t map, const char *file, int line)
712 {
713
714 if (map->system_map)
715 mtx_lock_flags_(&map->system_mtx, 0, file, line);
716 else
717 sx_slock_(&map->lock, file, line);
718 }
719
720 void
721 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
722 {
723
724 if (map->system_map) {
725 KASSERT((map->flags & MAP_REPLENISH) == 0,
726 ("%s: MAP_REPLENISH leaked", __func__));
727 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
728 } else {
729 sx_sunlock_(&map->lock, file, line);
730 vm_map_process_deferred();
731 }
732 }
733
734 int
735 _vm_map_trylock(vm_map_t map, const char *file, int line)
736 {
737 int error;
738
739 error = map->system_map ?
740 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
741 !sx_try_xlock_(&map->lock, file, line);
742 if (error == 0)
743 map->timestamp++;
744 return (error == 0);
745 }
746
747 int
748 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
749 {
750 int error;
751
752 error = map->system_map ?
753 !mtx_trylock_flags_(&map->system_mtx, 0, file, line) :
754 !sx_try_slock_(&map->lock, file, line);
755 return (error == 0);
756 }
757
758 /*
759 * _vm_map_lock_upgrade: [ internal use only ]
760 *
761 * Tries to upgrade a read (shared) lock on the specified map to a write
762 * (exclusive) lock. Returns the value "" if the upgrade succeeds and a
763 * non-zero value if the upgrade fails. If the upgrade fails, the map is
764 * returned without a read or write lock held.
765 *
766 * Requires that the map be read locked.
767 */
768 int
769 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
770 {
771 unsigned int last_timestamp;
772
773 if (map->system_map) {
774 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
775 } else {
776 if (!sx_try_upgrade_(&map->lock, file, line)) {
777 last_timestamp = map->timestamp;
778 sx_sunlock_(&map->lock, file, line);
779 vm_map_process_deferred();
780 /*
781 * If the map's timestamp does not change while the
782 * map is unlocked, then the upgrade succeeds.
783 */
784 sx_xlock_(&map->lock, file, line);
785 if (last_timestamp != map->timestamp) {
786 sx_xunlock_(&map->lock, file, line);
787 return (1);
788 }
789 }
790 }
791 map->timestamp++;
792 return (0);
793 }
794
795 void
796 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
797 {
798
799 if (map->system_map) {
800 KASSERT((map->flags & MAP_REPLENISH) == 0,
801 ("%s: MAP_REPLENISH leaked", __func__));
802 mtx_assert_(&map->system_mtx, MA_OWNED, file, line);
803 } else {
804 VM_MAP_UNLOCK_CONSISTENT(map);
805 sx_downgrade_(&map->lock, file, line);
806 }
807 }
808
809 /*
810 * vm_map_locked:
811 *
812 * Returns a non-zero value if the caller holds a write (exclusive) lock
813 * on the specified map and the value "" otherwise.
814 */
815 int
816 vm_map_locked(vm_map_t map)
817 {
818
819 if (map->system_map)
820 return (mtx_owned(&map->system_mtx));
821 else
822 return (sx_xlocked(&map->lock));
823 }
824
825 /*
826 * _vm_map_unlock_and_wait:
827 *
828 * Atomically releases the lock on the specified map and puts the calling
829 * thread to sleep. The calling thread will remain asleep until either
830 * vm_map_wakeup() is performed on the map or the specified timeout is
831 * exceeded.
832 *
833 * WARNING! This function does not perform deferred deallocations of
834 * objects and map entries. Therefore, the calling thread is expected to
835 * reacquire the map lock after reawakening and later perform an ordinary
836 * unlock operation, such as vm_map_unlock(), before completing its
837 * operation on the map.
838 */
839 int
840 _vm_map_unlock_and_wait(vm_map_t map, int timo, const char *file, int line)
841 {
842
843 VM_MAP_UNLOCK_CONSISTENT(map);
844 mtx_lock(&map_sleep_mtx);
845 if (map->system_map) {
846 KASSERT((map->flags & MAP_REPLENISH) == 0,
847 ("%s: MAP_REPLENISH leaked", __func__));
848 mtx_unlock_flags_(&map->system_mtx, 0, file, line);
849 } else {
850 sx_xunlock_(&map->lock, file, line);
851 }
852 return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps",
853 timo));
854 }
855
856 /*
857 * vm_map_wakeup:
858 *
859 * Awaken any threads that have slept on the map using
860 * vm_map_unlock_and_wait().
861 */
862 void
863 vm_map_wakeup(vm_map_t map)
864 {
865
866 /*
867 * Acquire and release map_sleep_mtx to prevent a wakeup()
868 * from being performed (and lost) between the map unlock
869 * and the msleep() in _vm_map_unlock_and_wait().
870 */
871 mtx_lock(&map_sleep_mtx);
872 mtx_unlock(&map_sleep_mtx);
873 wakeup(&map->root);
874 }
875
876 void
877 vm_map_busy(vm_map_t map)
878 {
879
880 VM_MAP_ASSERT_LOCKED(map);
881 map->busy++;
882 }
883
884 void
885 vm_map_unbusy(vm_map_t map)
886 {
887
888 VM_MAP_ASSERT_LOCKED(map);
889 KASSERT(map->busy, ("vm_map_unbusy: not busy"));
890 if (--map->busy == 0 && (map->flags & MAP_BUSY_WAKEUP)) {
891 vm_map_modflags(map, 0, MAP_BUSY_WAKEUP);
892 wakeup(&map->busy);
893 }
894 }
895
896 void
897 vm_map_wait_busy(vm_map_t map)
898 {
899
900 VM_MAP_ASSERT_LOCKED(map);
901 while (map->busy) {
902 vm_map_modflags(map, MAP_BUSY_WAKEUP, 0);
903 if (map->system_map)
904 msleep(&map->busy, &map->system_mtx, 0, "mbusy", 0);
905 else
906 sx_sleep(&map->busy, &map->lock, 0, "mbusy", 0);
907 }
908 map->timestamp++;
909 }
910
911 long
912 vmspace_resident_count(struct vmspace *vmspace)
913 {
914 return pmap_resident_count(vmspace_pmap(vmspace));
915 }
916
917 /*
918 * Initialize an existing vm_map structure
919 * such as that in the vmspace structure.
920 */
921 static void
922 _vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
923 {
924
925 map->header.eflags = MAP_ENTRY_HEADER;
926 map->needs_wakeup = FALSE;
927 map->system_map = 0;
928 map->pmap = pmap;
929 map->header.end = min;
930 map->header.start = max;
931 map->flags = 0;
932 map->header.left = map->header.right = &map->header;
933 map->root = NULL;
934 map->timestamp = 0;
935 map->busy = 0;
936 map->anon_loc = 0;
937 #ifdef DIAGNOSTIC
938 map->nupdates = 0;
939 #endif
940 }
941
942 void
943 vm_map_init(vm_map_t map, pmap_t pmap, vm_offset_t min, vm_offset_t max)
944 {
945
946 _vm_map_init(map, pmap, min, max);
947 mtx_init(&map->system_mtx, "vm map (system)", NULL,
948 MTX_DEF | MTX_DUPOK);
949 sx_init(&map->lock, "vm map (user)");
950 }
951
952 /*
953 * vm_map_entry_dispose: [ internal use only ]
954 *
955 * Inverse of vm_map_entry_create.
956 */
957 static void
958 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
959 {
960 uma_zfree(map->system_map ? kmapentzone : mapentzone, entry);
961 }
962
963 /*
964 * vm_map_entry_create: [ internal use only ]
965 *
966 * Allocates a VM map entry for insertion.
967 * No entry fields are filled in.
968 */
969 static vm_map_entry_t
970 vm_map_entry_create(vm_map_t map)
971 {
972 vm_map_entry_t new_entry;
973
974 #ifndef UMA_MD_SMALL_ALLOC
975 if (map == kernel_map) {
976 VM_MAP_ASSERT_LOCKED(map);
977
978 /*
979 * A new slab of kernel map entries cannot be allocated at this
980 * point because the kernel map has not yet been updated to
981 * reflect the caller's request. Therefore, we allocate a new
982 * map entry, dipping into the reserve if necessary, and set a
983 * flag indicating that the reserve must be replenished before
984 * the map is unlocked.
985 */
986 new_entry = uma_zalloc(kmapentzone, M_NOWAIT | M_NOVM);
987 if (new_entry == NULL) {
988 new_entry = uma_zalloc(kmapentzone,
989 M_NOWAIT | M_NOVM | M_USE_RESERVE);
990 kernel_map->flags |= MAP_REPLENISH;
991 }
992 } else
993 #endif
994 if (map->system_map) {
995 new_entry = uma_zalloc(kmapentzone, M_NOWAIT);
996 } else {
997 new_entry = uma_zalloc(mapentzone, M_WAITOK);
998 }
999 KASSERT(new_entry != NULL,
1000 ("vm_map_entry_create: kernel resources exhausted"));
1001 return (new_entry);
1002 }
1003
1004 /*
1005 * vm_map_entry_set_behavior:
1006 *
1007 * Set the expected access behavior, either normal, random, or
1008 * sequential.
1009 */
1010 static inline void
1011 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
1012 {
1013 entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) |
1014 (behavior & MAP_ENTRY_BEHAV_MASK);
1015 }
1016
1017 /*
1018 * vm_map_entry_max_free_{left,right}:
1019 *
1020 * Compute the size of the largest free gap between two entries,
1021 * one the root of a tree and the other the ancestor of that root
1022 * that is the least or greatest ancestor found on the search path.
1023 */
1024 static inline vm_size_t
1025 vm_map_entry_max_free_left(vm_map_entry_t root, vm_map_entry_t left_ancestor)
1026 {
1027
1028 return (root->left != left_ancestor ?
1029 root->left->max_free : root->start - left_ancestor->end);
1030 }
1031
1032 static inline vm_size_t
1033 vm_map_entry_max_free_right(vm_map_entry_t root, vm_map_entry_t right_ancestor)
1034 {
1035
1036 return (root->right != right_ancestor ?
1037 root->right->max_free : right_ancestor->start - root->end);
1038 }
1039
1040 /*
1041 * vm_map_entry_{pred,succ}:
1042 *
1043 * Find the {predecessor, successor} of the entry by taking one step
1044 * in the appropriate direction and backtracking as much as necessary.
1045 * vm_map_entry_succ is defined in vm_map.h.
1046 */
1047 static inline vm_map_entry_t
1048 vm_map_entry_pred(vm_map_entry_t entry)
1049 {
1050 vm_map_entry_t prior;
1051
1052 prior = entry->left;
1053 if (prior->right->start < entry->start) {
1054 do
1055 prior = prior->right;
1056 while (prior->right != entry);
1057 }
1058 return (prior);
1059 }
1060
1061 static inline vm_size_t
1062 vm_size_max(vm_size_t a, vm_size_t b)
1063 {
1064
1065 return (a > b ? a : b);
1066 }
1067
1068 #define SPLAY_LEFT_STEP(root, y, llist, rlist, test) do { \
1069 vm_map_entry_t z; \
1070 vm_size_t max_free; \
1071 \
1072 /* \
1073 * Infer root->right->max_free == root->max_free when \
1074 * y->max_free < root->max_free || root->max_free == 0. \
1075 * Otherwise, look right to find it. \
1076 */ \
1077 y = root->left; \
1078 max_free = root->max_free; \
1079 KASSERT(max_free == vm_size_max( \
1080 vm_map_entry_max_free_left(root, llist), \
1081 vm_map_entry_max_free_right(root, rlist)), \
1082 ("%s: max_free invariant fails", __func__)); \
1083 if (max_free - 1 < vm_map_entry_max_free_left(root, llist)) \
1084 max_free = vm_map_entry_max_free_right(root, rlist); \
1085 if (y != llist && (test)) { \
1086 /* Rotate right and make y root. */ \
1087 z = y->right; \
1088 if (z != root) { \
1089 root->left = z; \
1090 y->right = root; \
1091 if (max_free < y->max_free) \
1092 root->max_free = max_free = \
1093 vm_size_max(max_free, z->max_free); \
1094 } else if (max_free < y->max_free) \
1095 root->max_free = max_free = \
1096 vm_size_max(max_free, root->start - y->end);\
1097 root = y; \
1098 y = root->left; \
1099 } \
1100 /* Copy right->max_free. Put root on rlist. */ \
1101 root->max_free = max_free; \
1102 KASSERT(max_free == vm_map_entry_max_free_right(root, rlist), \
1103 ("%s: max_free not copied from right", __func__)); \
1104 root->left = rlist; \
1105 rlist = root; \
1106 root = y != llist ? y : NULL; \
1107 } while (0)
1108
1109 #define SPLAY_RIGHT_STEP(root, y, llist, rlist, test) do { \
1110 vm_map_entry_t z; \
1111 vm_size_t max_free; \
1112 \
1113 /* \
1114 * Infer root->left->max_free == root->max_free when \
1115 * y->max_free < root->max_free || root->max_free == 0. \
1116 * Otherwise, look left to find it. \
1117 */ \
1118 y = root->right; \
1119 max_free = root->max_free; \
1120 KASSERT(max_free == vm_size_max( \
1121 vm_map_entry_max_free_left(root, llist), \
1122 vm_map_entry_max_free_right(root, rlist)), \
1123 ("%s: max_free invariant fails", __func__)); \
1124 if (max_free - 1 < vm_map_entry_max_free_right(root, rlist)) \
1125 max_free = vm_map_entry_max_free_left(root, llist); \
1126 if (y != rlist && (test)) { \
1127 /* Rotate left and make y root. */ \
1128 z = y->left; \
1129 if (z != root) { \
1130 root->right = z; \
1131 y->left = root; \
1132 if (max_free < y->max_free) \
1133 root->max_free = max_free = \
1134 vm_size_max(max_free, z->max_free); \
1135 } else if (max_free < y->max_free) \
1136 root->max_free = max_free = \
1137 vm_size_max(max_free, y->start - root->end);\
1138 root = y; \
1139 y = root->right; \
1140 } \
1141 /* Copy left->max_free. Put root on llist. */ \
1142 root->max_free = max_free; \
1143 KASSERT(max_free == vm_map_entry_max_free_left(root, llist), \
1144 ("%s: max_free not copied from left", __func__)); \
1145 root->right = llist; \
1146 llist = root; \
1147 root = y != rlist ? y : NULL; \
1148 } while (0)
1149
1150 /*
1151 * Walk down the tree until we find addr or a gap where addr would go, breaking
1152 * off left and right subtrees of nodes less than, or greater than addr. Treat
1153 * subtrees with root->max_free < length as empty trees. llist and rlist are
1154 * the two sides in reverse order (bottom-up), with llist linked by the right
1155 * pointer and rlist linked by the left pointer in the vm_map_entry, and both
1156 * lists terminated by &map->header. This function, and the subsequent call to
1157 * vm_map_splay_merge_{left,right,pred,succ}, rely on the start and end address
1158 * values in &map->header.
1159 */
1160 static __always_inline vm_map_entry_t
1161 vm_map_splay_split(vm_map_t map, vm_offset_t addr, vm_size_t length,
1162 vm_map_entry_t *llist, vm_map_entry_t *rlist)
1163 {
1164 vm_map_entry_t left, right, root, y;
1165
1166 left = right = &map->header;
1167 root = map->root;
1168 while (root != NULL && root->max_free >= length) {
1169 KASSERT(left->end <= root->start &&
1170 root->end <= right->start,
1171 ("%s: root not within tree bounds", __func__));
1172 if (addr < root->start) {
1173 SPLAY_LEFT_STEP(root, y, left, right,
1174 y->max_free >= length && addr < y->start);
1175 } else if (addr >= root->end) {
1176 SPLAY_RIGHT_STEP(root, y, left, right,
1177 y->max_free >= length && addr >= y->end);
1178 } else
1179 break;
1180 }
1181 *llist = left;
1182 *rlist = right;
1183 return (root);
1184 }
1185
1186 static __always_inline void
1187 vm_map_splay_findnext(vm_map_entry_t root, vm_map_entry_t *rlist)
1188 {
1189 vm_map_entry_t hi, right, y;
1190
1191 right = *rlist;
1192 hi = root->right == right ? NULL : root->right;
1193 if (hi == NULL)
1194 return;
1195 do
1196 SPLAY_LEFT_STEP(hi, y, root, right, true);
1197 while (hi != NULL);
1198 *rlist = right;
1199 }
1200
1201 static __always_inline void
1202 vm_map_splay_findprev(vm_map_entry_t root, vm_map_entry_t *llist)
1203 {
1204 vm_map_entry_t left, lo, y;
1205
1206 left = *llist;
1207 lo = root->left == left ? NULL : root->left;
1208 if (lo == NULL)
1209 return;
1210 do
1211 SPLAY_RIGHT_STEP(lo, y, left, root, true);
1212 while (lo != NULL);
1213 *llist = left;
1214 }
1215
1216 static inline void
1217 vm_map_entry_swap(vm_map_entry_t *a, vm_map_entry_t *b)
1218 {
1219 vm_map_entry_t tmp;
1220
1221 tmp = *b;
1222 *b = *a;
1223 *a = tmp;
1224 }
1225
1226 /*
1227 * Walk back up the two spines, flip the pointers and set max_free. The
1228 * subtrees of the root go at the bottom of llist and rlist.
1229 */
1230 static vm_size_t
1231 vm_map_splay_merge_left_walk(vm_map_entry_t header, vm_map_entry_t root,
1232 vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t llist)
1233 {
1234 do {
1235 /*
1236 * The max_free values of the children of llist are in
1237 * llist->max_free and max_free. Update with the
1238 * max value.
1239 */
1240 llist->max_free = max_free =
1241 vm_size_max(llist->max_free, max_free);
1242 vm_map_entry_swap(&llist->right, &tail);
1243 vm_map_entry_swap(&tail, &llist);
1244 } while (llist != header);
1245 root->left = tail;
1246 return (max_free);
1247 }
1248
1249 /*
1250 * When llist is known to be the predecessor of root.
1251 */
1252 static inline vm_size_t
1253 vm_map_splay_merge_pred(vm_map_entry_t header, vm_map_entry_t root,
1254 vm_map_entry_t llist)
1255 {
1256 vm_size_t max_free;
1257
1258 max_free = root->start - llist->end;
1259 if (llist != header) {
1260 max_free = vm_map_splay_merge_left_walk(header, root,
1261 root, max_free, llist);
1262 } else {
1263 root->left = header;
1264 header->right = root;
1265 }
1266 return (max_free);
1267 }
1268
1269 /*
1270 * When llist may or may not be the predecessor of root.
1271 */
1272 static inline vm_size_t
1273 vm_map_splay_merge_left(vm_map_entry_t header, vm_map_entry_t root,
1274 vm_map_entry_t llist)
1275 {
1276 vm_size_t max_free;
1277
1278 max_free = vm_map_entry_max_free_left(root, llist);
1279 if (llist != header) {
1280 max_free = vm_map_splay_merge_left_walk(header, root,
1281 root->left == llist ? root : root->left,
1282 max_free, llist);
1283 }
1284 return (max_free);
1285 }
1286
1287 static vm_size_t
1288 vm_map_splay_merge_right_walk(vm_map_entry_t header, vm_map_entry_t root,
1289 vm_map_entry_t tail, vm_size_t max_free, vm_map_entry_t rlist)
1290 {
1291 do {
1292 /*
1293 * The max_free values of the children of rlist are in
1294 * rlist->max_free and max_free. Update with the
1295 * max value.
1296 */
1297 rlist->max_free = max_free =
1298 vm_size_max(rlist->max_free, max_free);
1299 vm_map_entry_swap(&rlist->left, &tail);
1300 vm_map_entry_swap(&tail, &rlist);
1301 } while (rlist != header);
1302 root->right = tail;
1303 return (max_free);
1304 }
1305
1306 /*
1307 * When rlist is known to be the succecessor of root.
1308 */
1309 static inline vm_size_t
1310 vm_map_splay_merge_succ(vm_map_entry_t header, vm_map_entry_t root,
1311 vm_map_entry_t rlist)
1312 {
1313 vm_size_t max_free;
1314
1315 max_free = rlist->start - root->end;
1316 if (rlist != header) {
1317 max_free = vm_map_splay_merge_right_walk(header, root,
1318 root, max_free, rlist);
1319 } else {
1320 root->right = header;
1321 header->left = root;
1322 }
1323 return (max_free);
1324 }
1325
1326 /*
1327 * When rlist may or may not be the succecessor of root.
1328 */
1329 static inline vm_size_t
1330 vm_map_splay_merge_right(vm_map_entry_t header, vm_map_entry_t root,
1331 vm_map_entry_t rlist)
1332 {
1333 vm_size_t max_free;
1334
1335 max_free = vm_map_entry_max_free_right(root, rlist);
1336 if (rlist != header) {
1337 max_free = vm_map_splay_merge_right_walk(header, root,
1338 root->right == rlist ? root : root->right,
1339 max_free, rlist);
1340 }
1341 return (max_free);
1342 }
1343
1344 /*
1345 * vm_map_splay:
1346 *
1347 * The Sleator and Tarjan top-down splay algorithm with the
1348 * following variation. Max_free must be computed bottom-up, so
1349 * on the downward pass, maintain the left and right spines in
1350 * reverse order. Then, make a second pass up each side to fix
1351 * the pointers and compute max_free. The time bound is O(log n)
1352 * amortized.
1353 *
1354 * The tree is threaded, which means that there are no null pointers.
1355 * When a node has no left child, its left pointer points to its
1356 * predecessor, which the last ancestor on the search path from the root
1357 * where the search branched right. Likewise, when a node has no right
1358 * child, its right pointer points to its successor. The map header node
1359 * is the predecessor of the first map entry, and the successor of the
1360 * last.
1361 *
1362 * The new root is the vm_map_entry containing "addr", or else an
1363 * adjacent entry (lower if possible) if addr is not in the tree.
1364 *
1365 * The map must be locked, and leaves it so.
1366 *
1367 * Returns: the new root.
1368 */
1369 static vm_map_entry_t
1370 vm_map_splay(vm_map_t map, vm_offset_t addr)
1371 {
1372 vm_map_entry_t header, llist, rlist, root;
1373 vm_size_t max_free_left, max_free_right;
1374
1375 header = &map->header;
1376 root = vm_map_splay_split(map, addr, 0, &llist, &rlist);
1377 if (root != NULL) {
1378 max_free_left = vm_map_splay_merge_left(header, root, llist);
1379 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1380 } else if (llist != header) {
1381 /*
1382 * Recover the greatest node in the left
1383 * subtree and make it the root.
1384 */
1385 root = llist;
1386 llist = root->right;
1387 max_free_left = vm_map_splay_merge_left(header, root, llist);
1388 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1389 } else if (rlist != header) {
1390 /*
1391 * Recover the least node in the right
1392 * subtree and make it the root.
1393 */
1394 root = rlist;
1395 rlist = root->left;
1396 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1397 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1398 } else {
1399 /* There is no root. */
1400 return (NULL);
1401 }
1402 root->max_free = vm_size_max(max_free_left, max_free_right);
1403 map->root = root;
1404 VM_MAP_ASSERT_CONSISTENT(map);
1405 return (root);
1406 }
1407
1408 /*
1409 * vm_map_entry_{un,}link:
1410 *
1411 * Insert/remove entries from maps. On linking, if new entry clips
1412 * existing entry, trim existing entry to avoid overlap, and manage
1413 * offsets. On unlinking, merge disappearing entry with neighbor, if
1414 * called for, and manage offsets. Callers should not modify fields in
1415 * entries already mapped.
1416 */
1417 static void
1418 vm_map_entry_link(vm_map_t map, vm_map_entry_t entry)
1419 {
1420 vm_map_entry_t header, llist, rlist, root;
1421 vm_size_t max_free_left, max_free_right;
1422
1423 CTR3(KTR_VM,
1424 "vm_map_entry_link: map %p, nentries %d, entry %p", map,
1425 map->nentries, entry);
1426 VM_MAP_ASSERT_LOCKED(map);
1427 map->nentries++;
1428 header = &map->header;
1429 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1430 if (root == NULL) {
1431 /*
1432 * The new entry does not overlap any existing entry in the
1433 * map, so it becomes the new root of the map tree.
1434 */
1435 max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1436 max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1437 } else if (entry->start == root->start) {
1438 /*
1439 * The new entry is a clone of root, with only the end field
1440 * changed. The root entry will be shrunk to abut the new
1441 * entry, and will be the right child of the new root entry in
1442 * the modified map.
1443 */
1444 KASSERT(entry->end < root->end,
1445 ("%s: clip_start not within entry", __func__));
1446 vm_map_splay_findprev(root, &llist);
1447 root->offset += entry->end - root->start;
1448 root->start = entry->end;
1449 max_free_left = vm_map_splay_merge_pred(header, entry, llist);
1450 max_free_right = root->max_free = vm_size_max(
1451 vm_map_splay_merge_pred(entry, root, entry),
1452 vm_map_splay_merge_right(header, root, rlist));
1453 } else {
1454 /*
1455 * The new entry is a clone of root, with only the start field
1456 * changed. The root entry will be shrunk to abut the new
1457 * entry, and will be the left child of the new root entry in
1458 * the modified map.
1459 */
1460 KASSERT(entry->end == root->end,
1461 ("%s: clip_start not within entry", __func__));
1462 vm_map_splay_findnext(root, &rlist);
1463 entry->offset += entry->start - root->start;
1464 root->end = entry->start;
1465 max_free_left = root->max_free = vm_size_max(
1466 vm_map_splay_merge_left(header, root, llist),
1467 vm_map_splay_merge_succ(entry, root, entry));
1468 max_free_right = vm_map_splay_merge_succ(header, entry, rlist);
1469 }
1470 entry->max_free = vm_size_max(max_free_left, max_free_right);
1471 map->root = entry;
1472 VM_MAP_ASSERT_CONSISTENT(map);
1473 }
1474
1475 enum unlink_merge_type {
1476 UNLINK_MERGE_NONE,
1477 UNLINK_MERGE_NEXT
1478 };
1479
1480 static void
1481 vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry,
1482 enum unlink_merge_type op)
1483 {
1484 vm_map_entry_t header, llist, rlist, root;
1485 vm_size_t max_free_left, max_free_right;
1486
1487 VM_MAP_ASSERT_LOCKED(map);
1488 header = &map->header;
1489 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1490 KASSERT(root != NULL,
1491 ("vm_map_entry_unlink: unlink object not mapped"));
1492
1493 vm_map_splay_findprev(root, &llist);
1494 vm_map_splay_findnext(root, &rlist);
1495 if (op == UNLINK_MERGE_NEXT) {
1496 rlist->start = root->start;
1497 rlist->offset = root->offset;
1498 }
1499 if (llist != header) {
1500 root = llist;
1501 llist = root->right;
1502 max_free_left = vm_map_splay_merge_left(header, root, llist);
1503 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1504 } else if (rlist != header) {
1505 root = rlist;
1506 rlist = root->left;
1507 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1508 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1509 } else {
1510 header->left = header->right = header;
1511 root = NULL;
1512 }
1513 if (root != NULL)
1514 root->max_free = vm_size_max(max_free_left, max_free_right);
1515 map->root = root;
1516 VM_MAP_ASSERT_CONSISTENT(map);
1517 map->nentries--;
1518 CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
1519 map->nentries, entry);
1520 }
1521
1522 /*
1523 * vm_map_entry_resize:
1524 *
1525 * Resize a vm_map_entry, recompute the amount of free space that
1526 * follows it and propagate that value up the tree.
1527 *
1528 * The map must be locked, and leaves it so.
1529 */
1530 static void
1531 vm_map_entry_resize(vm_map_t map, vm_map_entry_t entry, vm_size_t grow_amount)
1532 {
1533 vm_map_entry_t header, llist, rlist, root;
1534
1535 VM_MAP_ASSERT_LOCKED(map);
1536 header = &map->header;
1537 root = vm_map_splay_split(map, entry->start, 0, &llist, &rlist);
1538 KASSERT(root != NULL, ("%s: resize object not mapped", __func__));
1539 vm_map_splay_findnext(root, &rlist);
1540 entry->end += grow_amount;
1541 root->max_free = vm_size_max(
1542 vm_map_splay_merge_left(header, root, llist),
1543 vm_map_splay_merge_succ(header, root, rlist));
1544 map->root = root;
1545 VM_MAP_ASSERT_CONSISTENT(map);
1546 CTR4(KTR_VM, "%s: map %p, nentries %d, entry %p",
1547 __func__, map, map->nentries, entry);
1548 }
1549
1550 /*
1551 * vm_map_lookup_entry: [ internal use only ]
1552 *
1553 * Finds the map entry containing (or
1554 * immediately preceding) the specified address
1555 * in the given map; the entry is returned
1556 * in the "entry" parameter. The boolean
1557 * result indicates whether the address is
1558 * actually contained in the map.
1559 */
1560 boolean_t
1561 vm_map_lookup_entry(
1562 vm_map_t map,
1563 vm_offset_t address,
1564 vm_map_entry_t *entry) /* OUT */
1565 {
1566 vm_map_entry_t cur, header, lbound, ubound;
1567 boolean_t locked;
1568
1569 /*
1570 * If the map is empty, then the map entry immediately preceding
1571 * "address" is the map's header.
1572 */
1573 header = &map->header;
1574 cur = map->root;
1575 if (cur == NULL) {
1576 *entry = header;
1577 return (FALSE);
1578 }
1579 if (address >= cur->start && cur->end > address) {
1580 *entry = cur;
1581 return (TRUE);
1582 }
1583 if ((locked = vm_map_locked(map)) ||
1584 sx_try_upgrade(&map->lock)) {
1585 /*
1586 * Splay requires a write lock on the map. However, it only
1587 * restructures the binary search tree; it does not otherwise
1588 * change the map. Thus, the map's timestamp need not change
1589 * on a temporary upgrade.
1590 */
1591 cur = vm_map_splay(map, address);
1592 if (!locked) {
1593 VM_MAP_UNLOCK_CONSISTENT(map);
1594 sx_downgrade(&map->lock);
1595 }
1596
1597 /*
1598 * If "address" is contained within a map entry, the new root
1599 * is that map entry. Otherwise, the new root is a map entry
1600 * immediately before or after "address".
1601 */
1602 if (address < cur->start) {
1603 *entry = header;
1604 return (FALSE);
1605 }
1606 *entry = cur;
1607 return (address < cur->end);
1608 }
1609 /*
1610 * Since the map is only locked for read access, perform a
1611 * standard binary search tree lookup for "address".
1612 */
1613 lbound = ubound = header;
1614 for (;;) {
1615 if (address < cur->start) {
1616 ubound = cur;
1617 cur = cur->left;
1618 if (cur == lbound)
1619 break;
1620 } else if (cur->end <= address) {
1621 lbound = cur;
1622 cur = cur->right;
1623 if (cur == ubound)
1624 break;
1625 } else {
1626 *entry = cur;
1627 return (TRUE);
1628 }
1629 }
1630 *entry = lbound;
1631 return (FALSE);
1632 }
1633
1634 /*
1635 * vm_map_insert:
1636 *
1637 * Inserts the given whole VM object into the target
1638 * map at the specified address range. The object's
1639 * size should match that of the address range.
1640 *
1641 * Requires that the map be locked, and leaves it so.
1642 *
1643 * If object is non-NULL, ref count must be bumped by caller
1644 * prior to making call to account for the new entry.
1645 */
1646 int
1647 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1648 vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow)
1649 {
1650 vm_map_entry_t new_entry, next_entry, prev_entry;
1651 struct ucred *cred;
1652 vm_eflags_t protoeflags;
1653 vm_inherit_t inheritance;
1654 u_long bdry;
1655 u_int bidx;
1656
1657 VM_MAP_ASSERT_LOCKED(map);
1658 KASSERT(object != kernel_object ||
1659 (cow & MAP_COPY_ON_WRITE) == 0,
1660 ("vm_map_insert: kernel object and COW"));
1661 KASSERT(object == NULL || (cow & MAP_NOFAULT) == 0 ||
1662 (cow & MAP_SPLIT_BOUNDARY_MASK) != 0,
1663 ("vm_map_insert: paradoxical MAP_NOFAULT request, obj %p cow %#x",
1664 object, cow));
1665 KASSERT((prot & ~max) == 0,
1666 ("prot %#x is not subset of max_prot %#x", prot, max));
1667
1668 /*
1669 * Check that the start and end points are not bogus.
1670 */
1671 if (start == end || !vm_map_range_valid(map, start, end))
1672 return (KERN_INVALID_ADDRESS);
1673
1674 if ((map->flags & MAP_WXORX) != 0 && (prot & (VM_PROT_WRITE |
1675 VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE))
1676 return (KERN_PROTECTION_FAILURE);
1677
1678 /*
1679 * Find the entry prior to the proposed starting address; if it's part
1680 * of an existing entry, this range is bogus.
1681 */
1682 if (vm_map_lookup_entry(map, start, &prev_entry))
1683 return (KERN_NO_SPACE);
1684
1685 /*
1686 * Assert that the next entry doesn't overlap the end point.
1687 */
1688 next_entry = vm_map_entry_succ(prev_entry);
1689 if (next_entry->start < end)
1690 return (KERN_NO_SPACE);
1691
1692 if ((cow & MAP_CREATE_GUARD) != 0 && (object != NULL ||
1693 max != VM_PROT_NONE))
1694 return (KERN_INVALID_ARGUMENT);
1695
1696 protoeflags = 0;
1697 if (cow & MAP_COPY_ON_WRITE)
1698 protoeflags |= MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY;
1699 if (cow & MAP_NOFAULT)
1700 protoeflags |= MAP_ENTRY_NOFAULT;
1701 if (cow & MAP_DISABLE_SYNCER)
1702 protoeflags |= MAP_ENTRY_NOSYNC;
1703 if (cow & MAP_DISABLE_COREDUMP)
1704 protoeflags |= MAP_ENTRY_NOCOREDUMP;
1705 if (cow & MAP_STACK_GROWS_DOWN)
1706 protoeflags |= MAP_ENTRY_GROWS_DOWN;
1707 if (cow & MAP_STACK_GROWS_UP)
1708 protoeflags |= MAP_ENTRY_GROWS_UP;
1709 if (cow & MAP_WRITECOUNT)
1710 protoeflags |= MAP_ENTRY_WRITECNT;
1711 if (cow & MAP_VN_EXEC)
1712 protoeflags |= MAP_ENTRY_VN_EXEC;
1713 if ((cow & MAP_CREATE_GUARD) != 0)
1714 protoeflags |= MAP_ENTRY_GUARD;
1715 if ((cow & MAP_CREATE_STACK_GAP_DN) != 0)
1716 protoeflags |= MAP_ENTRY_STACK_GAP_DN;
1717 if ((cow & MAP_CREATE_STACK_GAP_UP) != 0)
1718 protoeflags |= MAP_ENTRY_STACK_GAP_UP;
1719 if (cow & MAP_INHERIT_SHARE)
1720 inheritance = VM_INHERIT_SHARE;
1721 else
1722 inheritance = VM_INHERIT_DEFAULT;
1723 if ((cow & MAP_SPLIT_BOUNDARY_MASK) != 0) {
1724 /* This magically ignores index 0, for usual page size. */
1725 bidx = (cow & MAP_SPLIT_BOUNDARY_MASK) >>
1726 MAP_SPLIT_BOUNDARY_SHIFT;
1727 if (bidx >= MAXPAGESIZES)
1728 return (KERN_INVALID_ARGUMENT);
1729 bdry = pagesizes[bidx] - 1;
1730 if ((start & bdry) != 0 || (end & bdry) != 0)
1731 return (KERN_INVALID_ARGUMENT);
1732 protoeflags |= bidx << MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
1733 }
1734
1735 cred = NULL;
1736 if ((cow & (MAP_ACC_NO_CHARGE | MAP_NOFAULT | MAP_CREATE_GUARD)) != 0)
1737 goto charged;
1738 if ((cow & MAP_ACC_CHARGED) || ((prot & VM_PROT_WRITE) &&
1739 ((protoeflags & MAP_ENTRY_NEEDS_COPY) || object == NULL))) {
1740 if (!(cow & MAP_ACC_CHARGED) && !swap_reserve(end - start))
1741 return (KERN_RESOURCE_SHORTAGE);
1742 KASSERT(object == NULL ||
1743 (protoeflags & MAP_ENTRY_NEEDS_COPY) != 0 ||
1744 object->cred == NULL,
1745 ("overcommit: vm_map_insert o %p", object));
1746 cred = curthread->td_ucred;
1747 }
1748
1749 charged:
1750 /* Expand the kernel pmap, if necessary. */
1751 if (map == kernel_map && end > kernel_vm_end)
1752 pmap_growkernel(end);
1753 if (object != NULL) {
1754 /*
1755 * OBJ_ONEMAPPING must be cleared unless this mapping
1756 * is trivially proven to be the only mapping for any
1757 * of the object's pages. (Object granularity
1758 * reference counting is insufficient to recognize
1759 * aliases with precision.)
1760 */
1761 if ((object->flags & OBJ_ANON) != 0) {
1762 VM_OBJECT_WLOCK(object);
1763 if (object->ref_count > 1 || object->shadow_count != 0)
1764 vm_object_clear_flag(object, OBJ_ONEMAPPING);
1765 VM_OBJECT_WUNLOCK(object);
1766 }
1767 } else if ((prev_entry->eflags & ~MAP_ENTRY_USER_WIRED) ==
1768 protoeflags &&
1769 (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP |
1770 MAP_VN_EXEC)) == 0 &&
1771 prev_entry->end == start && (prev_entry->cred == cred ||
1772 (prev_entry->object.vm_object != NULL &&
1773 prev_entry->object.vm_object->cred == cred)) &&
1774 vm_object_coalesce(prev_entry->object.vm_object,
1775 prev_entry->offset,
1776 (vm_size_t)(prev_entry->end - prev_entry->start),
1777 (vm_size_t)(end - prev_entry->end), cred != NULL &&
1778 (protoeflags & MAP_ENTRY_NEEDS_COPY) == 0)) {
1779 /*
1780 * We were able to extend the object. Determine if we
1781 * can extend the previous map entry to include the
1782 * new range as well.
1783 */
1784 if (prev_entry->inheritance == inheritance &&
1785 prev_entry->protection == prot &&
1786 prev_entry->max_protection == max &&
1787 prev_entry->wired_count == 0) {
1788 KASSERT((prev_entry->eflags & MAP_ENTRY_USER_WIRED) ==
1789 0, ("prev_entry %p has incoherent wiring",
1790 prev_entry));
1791 if ((prev_entry->eflags & MAP_ENTRY_GUARD) == 0)
1792 map->size += end - prev_entry->end;
1793 vm_map_entry_resize(map, prev_entry,
1794 end - prev_entry->end);
1795 vm_map_try_merge_entries(map, prev_entry, next_entry);
1796 return (KERN_SUCCESS);
1797 }
1798
1799 /*
1800 * If we can extend the object but cannot extend the
1801 * map entry, we have to create a new map entry. We
1802 * must bump the ref count on the extended object to
1803 * account for it. object may be NULL.
1804 */
1805 object = prev_entry->object.vm_object;
1806 offset = prev_entry->offset +
1807 (prev_entry->end - prev_entry->start);
1808 vm_object_reference(object);
1809 if (cred != NULL && object != NULL && object->cred != NULL &&
1810 !(prev_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
1811 /* Object already accounts for this uid. */
1812 cred = NULL;
1813 }
1814 }
1815 if (cred != NULL)
1816 crhold(cred);
1817
1818 /*
1819 * Create a new entry
1820 */
1821 new_entry = vm_map_entry_create(map);
1822 new_entry->start = start;
1823 new_entry->end = end;
1824 new_entry->cred = NULL;
1825
1826 new_entry->eflags = protoeflags;
1827 new_entry->object.vm_object = object;
1828 new_entry->offset = offset;
1829
1830 new_entry->inheritance = inheritance;
1831 new_entry->protection = prot;
1832 new_entry->max_protection = max;
1833 new_entry->wired_count = 0;
1834 new_entry->wiring_thread = NULL;
1835 new_entry->read_ahead = VM_FAULT_READ_AHEAD_INIT;
1836 new_entry->next_read = start;
1837
1838 KASSERT(cred == NULL || !ENTRY_CHARGED(new_entry),
1839 ("overcommit: vm_map_insert leaks vm_map %p", new_entry));
1840 new_entry->cred = cred;
1841
1842 /*
1843 * Insert the new entry into the list
1844 */
1845 vm_map_entry_link(map, new_entry);
1846 if ((new_entry->eflags & MAP_ENTRY_GUARD) == 0)
1847 map->size += new_entry->end - new_entry->start;
1848
1849 /*
1850 * Try to coalesce the new entry with both the previous and next
1851 * entries in the list. Previously, we only attempted to coalesce
1852 * with the previous entry when object is NULL. Here, we handle the
1853 * other cases, which are less common.
1854 */
1855 vm_map_try_merge_entries(map, prev_entry, new_entry);
1856 vm_map_try_merge_entries(map, new_entry, next_entry);
1857
1858 if ((cow & (MAP_PREFAULT | MAP_PREFAULT_PARTIAL)) != 0) {
1859 vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset),
1860 end - start, cow & MAP_PREFAULT_PARTIAL);
1861 }
1862
1863 return (KERN_SUCCESS);
1864 }
1865
1866 /*
1867 * vm_map_findspace:
1868 *
1869 * Find the first fit (lowest VM address) for "length" free bytes
1870 * beginning at address >= start in the given map.
1871 *
1872 * In a vm_map_entry, "max_free" is the maximum amount of
1873 * contiguous free space between an entry in its subtree and a
1874 * neighbor of that entry. This allows finding a free region in
1875 * one path down the tree, so O(log n) amortized with splay
1876 * trees.
1877 *
1878 * The map must be locked, and leaves it so.
1879 *
1880 * Returns: starting address if sufficient space,
1881 * vm_map_max(map)-length+1 if insufficient space.
1882 */
1883 vm_offset_t
1884 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length)
1885 {
1886 vm_map_entry_t header, llist, rlist, root, y;
1887 vm_size_t left_length, max_free_left, max_free_right;
1888 vm_offset_t gap_end;
1889
1890 VM_MAP_ASSERT_LOCKED(map);
1891
1892 /*
1893 * Request must fit within min/max VM address and must avoid
1894 * address wrap.
1895 */
1896 start = MAX(start, vm_map_min(map));
1897 if (start >= vm_map_max(map) || length > vm_map_max(map) - start)
1898 return (vm_map_max(map) - length + 1);
1899
1900 /* Empty tree means wide open address space. */
1901 if (map->root == NULL)
1902 return (start);
1903
1904 /*
1905 * After splay_split, if start is within an entry, push it to the start
1906 * of the following gap. If rlist is at the end of the gap containing
1907 * start, save the end of that gap in gap_end to see if the gap is big
1908 * enough; otherwise set gap_end to start skip gap-checking and move
1909 * directly to a search of the right subtree.
1910 */
1911 header = &map->header;
1912 root = vm_map_splay_split(map, start, length, &llist, &rlist);
1913 gap_end = rlist->start;
1914 if (root != NULL) {
1915 start = root->end;
1916 if (root->right != rlist)
1917 gap_end = start;
1918 max_free_left = vm_map_splay_merge_left(header, root, llist);
1919 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1920 } else if (rlist != header) {
1921 root = rlist;
1922 rlist = root->left;
1923 max_free_left = vm_map_splay_merge_pred(header, root, llist);
1924 max_free_right = vm_map_splay_merge_right(header, root, rlist);
1925 } else {
1926 root = llist;
1927 llist = root->right;
1928 max_free_left = vm_map_splay_merge_left(header, root, llist);
1929 max_free_right = vm_map_splay_merge_succ(header, root, rlist);
1930 }
1931 root->max_free = vm_size_max(max_free_left, max_free_right);
1932 map->root = root;
1933 VM_MAP_ASSERT_CONSISTENT(map);
1934 if (length <= gap_end - start)
1935 return (start);
1936
1937 /* With max_free, can immediately tell if no solution. */
1938 if (root->right == header || length > root->right->max_free)
1939 return (vm_map_max(map) - length + 1);
1940
1941 /*
1942 * Splay for the least large-enough gap in the right subtree.
1943 */
1944 llist = rlist = header;
1945 for (left_length = 0;;
1946 left_length = vm_map_entry_max_free_left(root, llist)) {
1947 if (length <= left_length)
1948 SPLAY_LEFT_STEP(root, y, llist, rlist,
1949 length <= vm_map_entry_max_free_left(y, llist));
1950 else
1951 SPLAY_RIGHT_STEP(root, y, llist, rlist,
1952 length > vm_map_entry_max_free_left(y, root));
1953 if (root == NULL)
1954 break;
1955 }
1956 root = llist;
1957 llist = root->right;
1958 max_free_left = vm_map_splay_merge_left(header, root, llist);
1959 if (rlist == header) {
1960 root->max_free = vm_size_max(max_free_left,
1961 vm_map_splay_merge_succ(header, root, rlist));
1962 } else {
1963 y = rlist;
1964 rlist = y->left;
1965 y->max_free = vm_size_max(
1966 vm_map_splay_merge_pred(root, y, root),
1967 vm_map_splay_merge_right(header, y, rlist));
1968 root->max_free = vm_size_max(max_free_left, y->max_free);
1969 }
1970 map->root = root;
1971 VM_MAP_ASSERT_CONSISTENT(map);
1972 return (root->end);
1973 }
1974
1975 int
1976 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
1977 vm_offset_t start, vm_size_t length, vm_prot_t prot,
1978 vm_prot_t max, int cow)
1979 {
1980 vm_offset_t end;
1981 int result;
1982
1983 end = start + length;
1984 KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
1985 object == NULL,
1986 ("vm_map_fixed: non-NULL backing object for stack"));
1987 vm_map_lock(map);
1988 VM_MAP_RANGE_CHECK(map, start, end);
1989 if ((cow & MAP_CHECK_EXCL) == 0) {
1990 result = vm_map_delete(map, start, end);
1991 if (result != KERN_SUCCESS)
1992 goto out;
1993 }
1994 if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
1995 result = vm_map_stack_locked(map, start, length, sgrowsiz,
1996 prot, max, cow);
1997 } else {
1998 result = vm_map_insert(map, object, offset, start, end,
1999 prot, max, cow);
2000 }
2001 out:
2002 vm_map_unlock(map);
2003 return (result);
2004 }
2005
2006 static const int aslr_pages_rnd_64[2] = {0x1000, 0x10};
2007 static const int aslr_pages_rnd_32[2] = {0x100, 0x4};
2008
2009 static int cluster_anon = 1;
2010 SYSCTL_INT(_vm, OID_AUTO, cluster_anon, CTLFLAG_RW,
2011 &cluster_anon, 0,
2012 "Cluster anonymous mappings: 0 = no, 1 = yes if no hint, 2 = always");
2013
2014 static bool
2015 clustering_anon_allowed(vm_offset_t addr)
2016 {
2017
2018 switch (cluster_anon) {
2019 case 0:
2020 return (false);
2021 case 1:
2022 return (addr == 0);
2023 case 2:
2024 default:
2025 return (true);
2026 }
2027 }
2028
2029 static long aslr_restarts;
2030 SYSCTL_LONG(_vm, OID_AUTO, aslr_restarts, CTLFLAG_RD,
2031 &aslr_restarts, 0,
2032 "Number of aslr failures");
2033
2034 /*
2035 * Searches for the specified amount of free space in the given map with the
2036 * specified alignment. Performs an address-ordered, first-fit search from
2037 * the given address "*addr", with an optional upper bound "max_addr". If the
2038 * parameter "alignment" is zero, then the alignment is computed from the
2039 * given (object, offset) pair so as to enable the greatest possible use of
2040 * superpage mappings. Returns KERN_SUCCESS and the address of the free space
2041 * in "*addr" if successful. Otherwise, returns KERN_NO_SPACE.
2042 *
2043 * The map must be locked. Initially, there must be at least "length" bytes
2044 * of free space at the given address.
2045 */
2046 static int
2047 vm_map_alignspace(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2048 vm_offset_t *addr, vm_size_t length, vm_offset_t max_addr,
2049 vm_offset_t alignment)
2050 {
2051 vm_offset_t aligned_addr, free_addr;
2052
2053 VM_MAP_ASSERT_LOCKED(map);
2054 free_addr = *addr;
2055 KASSERT(free_addr == vm_map_findspace(map, free_addr, length),
2056 ("caller failed to provide space %#jx at address %p",
2057 (uintmax_t)length, (void *)free_addr));
2058 for (;;) {
2059 /*
2060 * At the start of every iteration, the free space at address
2061 * "*addr" is at least "length" bytes.
2062 */
2063 if (alignment == 0)
2064 pmap_align_superpage(object, offset, addr, length);
2065 else if ((*addr & (alignment - 1)) != 0) {
2066 *addr &= ~(alignment - 1);
2067 *addr += alignment;
2068 }
2069 aligned_addr = *addr;
2070 if (aligned_addr == free_addr) {
2071 /*
2072 * Alignment did not change "*addr", so "*addr" must
2073 * still provide sufficient free space.
2074 */
2075 return (KERN_SUCCESS);
2076 }
2077
2078 /*
2079 * Test for address wrap on "*addr". A wrapped "*addr" could
2080 * be a valid address, in which case vm_map_findspace() cannot
2081 * be relied upon to fail.
2082 */
2083 if (aligned_addr < free_addr)
2084 return (KERN_NO_SPACE);
2085 *addr = vm_map_findspace(map, aligned_addr, length);
2086 if (*addr + length > vm_map_max(map) ||
2087 (max_addr != 0 && *addr + length > max_addr))
2088 return (KERN_NO_SPACE);
2089 free_addr = *addr;
2090 if (free_addr == aligned_addr) {
2091 /*
2092 * If a successful call to vm_map_findspace() did not
2093 * change "*addr", then "*addr" must still be aligned
2094 * and provide sufficient free space.
2095 */
2096 return (KERN_SUCCESS);
2097 }
2098 }
2099 }
2100
2101 int
2102 vm_map_find_aligned(vm_map_t map, vm_offset_t *addr, vm_size_t length,
2103 vm_offset_t max_addr, vm_offset_t alignment)
2104 {
2105 /* XXXKIB ASLR eh ? */
2106 *addr = vm_map_findspace(map, *addr, length);
2107 if (*addr + length > vm_map_max(map) ||
2108 (max_addr != 0 && *addr + length > max_addr))
2109 return (KERN_NO_SPACE);
2110 return (vm_map_alignspace(map, NULL, 0, addr, length, max_addr,
2111 alignment));
2112 }
2113
2114 /*
2115 * vm_map_find finds an unallocated region in the target address
2116 * map with the given length. The search is defined to be
2117 * first-fit from the specified address; the region found is
2118 * returned in the same parameter.
2119 *
2120 * If object is non-NULL, ref count must be bumped by caller
2121 * prior to making call to account for the new entry.
2122 */
2123 int
2124 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2125 vm_offset_t *addr, /* IN/OUT */
2126 vm_size_t length, vm_offset_t max_addr, int find_space,
2127 vm_prot_t prot, vm_prot_t max, int cow)
2128 {
2129 vm_offset_t alignment, curr_min_addr, min_addr;
2130 int gap, pidx, rv, try;
2131 bool cluster, en_aslr, update_anon;
2132
2133 KASSERT((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0 ||
2134 object == NULL,
2135 ("vm_map_find: non-NULL backing object for stack"));
2136 MPASS((cow & MAP_REMAP) == 0 || (find_space == VMFS_NO_SPACE &&
2137 (cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) == 0));
2138 if (find_space == VMFS_OPTIMAL_SPACE && (object == NULL ||
2139 (object->flags & OBJ_COLORED) == 0))
2140 find_space = VMFS_ANY_SPACE;
2141 if (find_space >> 8 != 0) {
2142 KASSERT((find_space & 0xff) == 0, ("bad VMFS flags"));
2143 alignment = (vm_offset_t)1 << (find_space >> 8);
2144 } else
2145 alignment = 0;
2146 en_aslr = (map->flags & MAP_ASLR) != 0;
2147 update_anon = cluster = clustering_anon_allowed(*addr) &&
2148 (map->flags & MAP_IS_SUB_MAP) == 0 && max_addr == 0 &&
2149 find_space != VMFS_NO_SPACE && object == NULL &&
2150 (cow & (MAP_INHERIT_SHARE | MAP_STACK_GROWS_UP |
2151 MAP_STACK_GROWS_DOWN)) == 0 && prot != PROT_NONE;
2152 curr_min_addr = min_addr = *addr;
2153 if (en_aslr && min_addr == 0 && !cluster &&
2154 find_space != VMFS_NO_SPACE &&
2155 (map->flags & MAP_ASLR_IGNSTART) != 0)
2156 curr_min_addr = min_addr = vm_map_min(map);
2157 try = 0;
2158 vm_map_lock(map);
2159 if (cluster) {
2160 curr_min_addr = map->anon_loc;
2161 if (curr_min_addr == 0)
2162 cluster = false;
2163 }
2164 if (find_space != VMFS_NO_SPACE) {
2165 KASSERT(find_space == VMFS_ANY_SPACE ||
2166 find_space == VMFS_OPTIMAL_SPACE ||
2167 find_space == VMFS_SUPER_SPACE ||
2168 alignment != 0, ("unexpected VMFS flag"));
2169 again:
2170 /*
2171 * When creating an anonymous mapping, try clustering
2172 * with an existing anonymous mapping first.
2173 *
2174 * We make up to two attempts to find address space
2175 * for a given find_space value. The first attempt may
2176 * apply randomization or may cluster with an existing
2177 * anonymous mapping. If this first attempt fails,
2178 * perform a first-fit search of the available address
2179 * space.
2180 *
2181 * If all tries failed, and find_space is
2182 * VMFS_OPTIMAL_SPACE, fallback to VMFS_ANY_SPACE.
2183 * Again enable clustering and randomization.
2184 */
2185 try++;
2186 MPASS(try <= 2);
2187
2188 if (try == 2) {
2189 /*
2190 * Second try: we failed either to find a
2191 * suitable region for randomizing the
2192 * allocation, or to cluster with an existing
2193 * mapping. Retry with free run.
2194 */
2195 curr_min_addr = (map->flags & MAP_ASLR_IGNSTART) != 0 ?
2196 vm_map_min(map) : min_addr;
2197 atomic_add_long(&aslr_restarts, 1);
2198 }
2199
2200 if (try == 1 && en_aslr && !cluster) {
2201 /*
2202 * Find space for allocation, including
2203 * gap needed for later randomization.
2204 */
2205 pidx = MAXPAGESIZES > 1 && pagesizes[1] != 0 &&
2206 (find_space == VMFS_SUPER_SPACE || find_space ==
2207 VMFS_OPTIMAL_SPACE) ? 1 : 0;
2208 gap = vm_map_max(map) > MAP_32BIT_MAX_ADDR &&
2209 (max_addr == 0 || max_addr > MAP_32BIT_MAX_ADDR) ?
2210 aslr_pages_rnd_64[pidx] : aslr_pages_rnd_32[pidx];
2211 *addr = vm_map_findspace(map, curr_min_addr,
2212 length + gap * pagesizes[pidx]);
2213 if (*addr + length + gap * pagesizes[pidx] >
2214 vm_map_max(map))
2215 goto again;
2216 /* And randomize the start address. */
2217 *addr += (arc4random() % gap) * pagesizes[pidx];
2218 if (max_addr != 0 && *addr + length > max_addr)
2219 goto again;
2220 } else {
2221 *addr = vm_map_findspace(map, curr_min_addr, length);
2222 if (*addr + length > vm_map_max(map) ||
2223 (max_addr != 0 && *addr + length > max_addr)) {
2224 if (cluster) {
2225 cluster = false;
2226 MPASS(try == 1);
2227 goto again;
2228 }
2229 rv = KERN_NO_SPACE;
2230 goto done;
2231 }
2232 }
2233
2234 if (find_space != VMFS_ANY_SPACE &&
2235 (rv = vm_map_alignspace(map, object, offset, addr, length,
2236 max_addr, alignment)) != KERN_SUCCESS) {
2237 if (find_space == VMFS_OPTIMAL_SPACE) {
2238 find_space = VMFS_ANY_SPACE;
2239 curr_min_addr = min_addr;
2240 cluster = update_anon;
2241 try = 0;
2242 goto again;
2243 }
2244 goto done;
2245 }
2246 } else if ((cow & MAP_REMAP) != 0) {
2247 if (!vm_map_range_valid(map, *addr, *addr + length)) {
2248 rv = KERN_INVALID_ADDRESS;
2249 goto done;
2250 }
2251 rv = vm_map_delete(map, *addr, *addr + length);
2252 if (rv != KERN_SUCCESS)
2253 goto done;
2254 }
2255 if ((cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP)) != 0) {
2256 rv = vm_map_stack_locked(map, *addr, length, sgrowsiz, prot,
2257 max, cow);
2258 } else {
2259 rv = vm_map_insert(map, object, offset, *addr, *addr + length,
2260 prot, max, cow);
2261 }
2262 if (rv == KERN_SUCCESS && update_anon)
2263 map->anon_loc = *addr + length;
2264 done:
2265 vm_map_unlock(map);
2266 return (rv);
2267 }
2268
2269 /*
2270 * vm_map_find_min() is a variant of vm_map_find() that takes an
2271 * additional parameter (min_addr) and treats the given address
2272 * (*addr) differently. Specifically, it treats *addr as a hint
2273 * and not as the minimum address where the mapping is created.
2274 *
2275 * This function works in two phases. First, it tries to
2276 * allocate above the hint. If that fails and the hint is
2277 * greater than min_addr, it performs a second pass, replacing
2278 * the hint with min_addr as the minimum address for the
2279 * allocation.
2280 */
2281 int
2282 vm_map_find_min(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
2283 vm_offset_t *addr, vm_size_t length, vm_offset_t min_addr,
2284 vm_offset_t max_addr, int find_space, vm_prot_t prot, vm_prot_t max,
2285 int cow)
2286 {
2287 vm_offset_t hint;
2288 int rv;
2289
2290 hint = *addr;
2291 for (;;) {
2292 rv = vm_map_find(map, object, offset, addr, length, max_addr,
2293 find_space, prot, max, cow);
2294 if (rv == KERN_SUCCESS || min_addr >= hint)
2295 return (rv);
2296 *addr = hint = min_addr;
2297 }
2298 }
2299
2300 /*
2301 * A map entry with any of the following flags set must not be merged with
2302 * another entry.
2303 */
2304 #define MAP_ENTRY_NOMERGE_MASK (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP | \
2305 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_VN_EXEC)
2306
2307 static bool
2308 vm_map_mergeable_neighbors(vm_map_entry_t prev, vm_map_entry_t entry)
2309 {
2310
2311 KASSERT((prev->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 ||
2312 (entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0,
2313 ("vm_map_mergeable_neighbors: neither %p nor %p are mergeable",
2314 prev, entry));
2315 return (prev->end == entry->start &&
2316 prev->object.vm_object == entry->object.vm_object &&
2317 (prev->object.vm_object == NULL ||
2318 prev->offset + (prev->end - prev->start) == entry->offset) &&
2319 prev->eflags == entry->eflags &&
2320 prev->protection == entry->protection &&
2321 prev->max_protection == entry->max_protection &&
2322 prev->inheritance == entry->inheritance &&
2323 prev->wired_count == entry->wired_count &&
2324 prev->cred == entry->cred);
2325 }
2326
2327 static void
2328 vm_map_merged_neighbor_dispose(vm_map_t map, vm_map_entry_t entry)
2329 {
2330
2331 /*
2332 * If the backing object is a vnode object, vm_object_deallocate()
2333 * calls vrele(). However, vrele() does not lock the vnode because
2334 * the vnode has additional references. Thus, the map lock can be
2335 * kept without causing a lock-order reversal with the vnode lock.
2336 *
2337 * Since we count the number of virtual page mappings in
2338 * object->un_pager.vnp.writemappings, the writemappings value
2339 * should not be adjusted when the entry is disposed of.
2340 */
2341 if (entry->object.vm_object != NULL)
2342 vm_object_deallocate(entry->object.vm_object);
2343 if (entry->cred != NULL)
2344 crfree(entry->cred);
2345 vm_map_entry_dispose(map, entry);
2346 }
2347
2348 /*
2349 * vm_map_try_merge_entries:
2350 *
2351 * Compare the given map entry to its predecessor, and merge its precessor
2352 * into it if possible. The entry remains valid, and may be extended.
2353 * The predecessor may be deleted.
2354 *
2355 * The map must be locked.
2356 */
2357 void
2358 vm_map_try_merge_entries(vm_map_t map, vm_map_entry_t prev_entry,
2359 vm_map_entry_t entry)
2360 {
2361
2362 VM_MAP_ASSERT_LOCKED(map);
2363 if ((entry->eflags & MAP_ENTRY_NOMERGE_MASK) == 0 &&
2364 vm_map_mergeable_neighbors(prev_entry, entry)) {
2365 vm_map_entry_unlink(map, prev_entry, UNLINK_MERGE_NEXT);
2366 vm_map_merged_neighbor_dispose(map, prev_entry);
2367 }
2368 }
2369
2370 /*
2371 * vm_map_entry_back:
2372 *
2373 * Allocate an object to back a map entry.
2374 */
2375 static inline void
2376 vm_map_entry_back(vm_map_entry_t entry)
2377 {
2378 vm_object_t object;
2379
2380 KASSERT(entry->object.vm_object == NULL,
2381 ("map entry %p has backing object", entry));
2382 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2383 ("map entry %p is a submap", entry));
2384 object = vm_object_allocate_anon(atop(entry->end - entry->start), NULL,
2385 entry->cred, entry->end - entry->start);
2386 entry->object.vm_object = object;
2387 entry->offset = 0;
2388 entry->cred = NULL;
2389 }
2390
2391 /*
2392 * vm_map_entry_charge_object
2393 *
2394 * If there is no object backing this entry, create one. Otherwise, if
2395 * the entry has cred, give it to the backing object.
2396 */
2397 static inline void
2398 vm_map_entry_charge_object(vm_map_t map, vm_map_entry_t entry)
2399 {
2400
2401 VM_MAP_ASSERT_LOCKED(map);
2402 KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
2403 ("map entry %p is a submap", entry));
2404 if (entry->object.vm_object == NULL && !map->system_map &&
2405 (entry->eflags & MAP_ENTRY_GUARD) == 0)
2406 vm_map_entry_back(entry);
2407 else if (entry->object.vm_object != NULL &&
2408 ((entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) &&
2409 entry->cred != NULL) {
2410 VM_OBJECT_WLOCK(entry->object.vm_object);
2411 KASSERT(entry->object.vm_object->cred == NULL,
2412 ("OVERCOMMIT: %s: both cred e %p", __func__, entry));
2413 entry->object.vm_object->cred = entry->cred;
2414 entry->object.vm_object->charge = entry->end - entry->start;
2415 VM_OBJECT_WUNLOCK(entry->object.vm_object);
2416 entry->cred = NULL;
2417 }
2418 }
2419
2420 /*
2421 * vm_map_entry_clone
2422 *
2423 * Create a duplicate map entry for clipping.
2424 */
2425 static vm_map_entry_t
2426 vm_map_entry_clone(vm_map_t map, vm_map_entry_t entry)
2427 {
2428 vm_map_entry_t new_entry;
2429
2430 VM_MAP_ASSERT_LOCKED(map);
2431
2432 /*
2433 * Create a backing object now, if none exists, so that more individual
2434 * objects won't be created after the map entry is split.
2435 */
2436 vm_map_entry_charge_object(map, entry);
2437
2438 /* Clone the entry. */
2439 new_entry = vm_map_entry_create(map);
2440 *new_entry = *entry;
2441 if (new_entry->cred != NULL)
2442 crhold(entry->cred);
2443 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
2444 vm_object_reference(new_entry->object.vm_object);
2445 vm_map_entry_set_vnode_text(new_entry, true);
2446 /*
2447 * The object->un_pager.vnp.writemappings for the object of
2448 * MAP_ENTRY_WRITECNT type entry shall be kept as is here. The
2449 * virtual pages are re-distributed among the clipped entries,
2450 * so the sum is left the same.
2451 */
2452 }
2453 return (new_entry);
2454 }
2455
2456 /*
2457 * vm_map_clip_start: [ internal use only ]
2458 *
2459 * Asserts that the given entry begins at or after
2460 * the specified address; if necessary,
2461 * it splits the entry into two.
2462 */
2463 static int
2464 vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t startaddr)
2465 {
2466 vm_map_entry_t new_entry;
2467 int bdry_idx;
2468
2469 if (!map->system_map)
2470 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2471 "%s: map %p entry %p start 0x%jx", __func__, map, entry,
2472 (uintmax_t)startaddr);
2473
2474 if (startaddr <= entry->start)
2475 return (KERN_SUCCESS);
2476
2477 VM_MAP_ASSERT_LOCKED(map);
2478 KASSERT(entry->end > startaddr && entry->start < startaddr,
2479 ("%s: invalid clip of entry %p", __func__, entry));
2480
2481 bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
2482 MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
2483 if (bdry_idx != 0) {
2484 if ((startaddr & (pagesizes[bdry_idx] - 1)) != 0)
2485 return (KERN_INVALID_ARGUMENT);
2486 }
2487
2488 new_entry = vm_map_entry_clone(map, entry);
2489
2490 /*
2491 * Split off the front portion. Insert the new entry BEFORE this one,
2492 * so that this entry has the specified starting address.
2493 */
2494 new_entry->end = startaddr;
2495 vm_map_entry_link(map, new_entry);
2496 return (KERN_SUCCESS);
2497 }
2498
2499 /*
2500 * vm_map_lookup_clip_start:
2501 *
2502 * Find the entry at or just after 'start', and clip it if 'start' is in
2503 * the interior of the entry. Return entry after 'start', and in
2504 * prev_entry set the entry before 'start'.
2505 */
2506 static int
2507 vm_map_lookup_clip_start(vm_map_t map, vm_offset_t start,
2508 vm_map_entry_t *res_entry, vm_map_entry_t *prev_entry)
2509 {
2510 vm_map_entry_t entry;
2511 int rv;
2512
2513 if (!map->system_map)
2514 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2515 "%s: map %p start 0x%jx prev %p", __func__, map,
2516 (uintmax_t)start, prev_entry);
2517
2518 if (vm_map_lookup_entry(map, start, prev_entry)) {
2519 entry = *prev_entry;
2520 rv = vm_map_clip_start(map, entry, start);
2521 if (rv != KERN_SUCCESS)
2522 return (rv);
2523 *prev_entry = vm_map_entry_pred(entry);
2524 } else
2525 entry = vm_map_entry_succ(*prev_entry);
2526 *res_entry = entry;
2527 return (KERN_SUCCESS);
2528 }
2529
2530 /*
2531 * vm_map_clip_end: [ internal use only ]
2532 *
2533 * Asserts that the given entry ends at or before
2534 * the specified address; if necessary,
2535 * it splits the entry into two.
2536 */
2537 static int
2538 vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t endaddr)
2539 {
2540 vm_map_entry_t new_entry;
2541 int bdry_idx;
2542
2543 if (!map->system_map)
2544 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2545 "%s: map %p entry %p end 0x%jx", __func__, map, entry,
2546 (uintmax_t)endaddr);
2547
2548 if (endaddr >= entry->end)
2549 return (KERN_SUCCESS);
2550
2551 VM_MAP_ASSERT_LOCKED(map);
2552 KASSERT(entry->start < endaddr && entry->end > endaddr,
2553 ("%s: invalid clip of entry %p", __func__, entry));
2554
2555 bdry_idx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
2556 MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
2557 if (bdry_idx != 0) {
2558 if ((endaddr & (pagesizes[bdry_idx] - 1)) != 0)
2559 return (KERN_INVALID_ARGUMENT);
2560 }
2561
2562 new_entry = vm_map_entry_clone(map, entry);
2563
2564 /*
2565 * Split off the back portion. Insert the new entry AFTER this one,
2566 * so that this entry has the specified ending address.
2567 */
2568 new_entry->start = endaddr;
2569 vm_map_entry_link(map, new_entry);
2570
2571 return (KERN_SUCCESS);
2572 }
2573
2574 /*
2575 * vm_map_submap: [ kernel use only ]
2576 *
2577 * Mark the given range as handled by a subordinate map.
2578 *
2579 * This range must have been created with vm_map_find,
2580 * and no other operations may have been performed on this
2581 * range prior to calling vm_map_submap.
2582 *
2583 * Only a limited number of operations can be performed
2584 * within this rage after calling vm_map_submap:
2585 * vm_fault
2586 * [Don't try vm_map_copy!]
2587 *
2588 * To remove a submapping, one must first remove the
2589 * range from the superior map, and then destroy the
2590 * submap (if desired). [Better yet, don't try it.]
2591 */
2592 int
2593 vm_map_submap(
2594 vm_map_t map,
2595 vm_offset_t start,
2596 vm_offset_t end,
2597 vm_map_t submap)
2598 {
2599 vm_map_entry_t entry;
2600 int result;
2601
2602 result = KERN_INVALID_ARGUMENT;
2603
2604 vm_map_lock(submap);
2605 submap->flags |= MAP_IS_SUB_MAP;
2606 vm_map_unlock(submap);
2607
2608 vm_map_lock(map);
2609 VM_MAP_RANGE_CHECK(map, start, end);
2610 if (vm_map_lookup_entry(map, start, &entry) && entry->end >= end &&
2611 (entry->eflags & MAP_ENTRY_COW) == 0 &&
2612 entry->object.vm_object == NULL) {
2613 result = vm_map_clip_start(map, entry, start);
2614 if (result != KERN_SUCCESS)
2615 goto unlock;
2616 result = vm_map_clip_end(map, entry, end);
2617 if (result != KERN_SUCCESS)
2618 goto unlock;
2619 entry->object.sub_map = submap;
2620 entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
2621 result = KERN_SUCCESS;
2622 }
2623 unlock:
2624 vm_map_unlock(map);
2625
2626 if (result != KERN_SUCCESS) {
2627 vm_map_lock(submap);
2628 submap->flags &= ~MAP_IS_SUB_MAP;
2629 vm_map_unlock(submap);
2630 }
2631 return (result);
2632 }
2633
2634 /*
2635 * The maximum number of pages to map if MAP_PREFAULT_PARTIAL is specified
2636 */
2637 #define MAX_INIT_PT 96
2638
2639 /*
2640 * vm_map_pmap_enter:
2641 *
2642 * Preload the specified map's pmap with mappings to the specified
2643 * object's memory-resident pages. No further physical pages are
2644 * allocated, and no further virtual pages are retrieved from secondary
2645 * storage. If the specified flags include MAP_PREFAULT_PARTIAL, then a
2646 * limited number of page mappings are created at the low-end of the
2647 * specified address range. (For this purpose, a superpage mapping
2648 * counts as one page mapping.) Otherwise, all resident pages within
2649 * the specified address range are mapped.
2650 */
2651 static void
2652 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
2653 vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
2654 {
2655 vm_offset_t start;
2656 vm_page_t p, p_start;
2657 vm_pindex_t mask, psize, threshold, tmpidx;
2658
2659 if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
2660 return;
2661 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2662 VM_OBJECT_WLOCK(object);
2663 if (object->type == OBJT_DEVICE || object->type == OBJT_SG) {
2664 pmap_object_init_pt(map->pmap, addr, object, pindex,
2665 size);
2666 VM_OBJECT_WUNLOCK(object);
2667 return;
2668 }
2669 VM_OBJECT_LOCK_DOWNGRADE(object);
2670 } else
2671 VM_OBJECT_RLOCK(object);
2672
2673 psize = atop(size);
2674 if (psize + pindex > object->size) {
2675 if (pindex >= object->size) {
2676 VM_OBJECT_RUNLOCK(object);
2677 return;
2678 }
2679 psize = object->size - pindex;
2680 }
2681
2682 start = 0;
2683 p_start = NULL;
2684 threshold = MAX_INIT_PT;
2685
2686 p = vm_page_find_least(object, pindex);
2687 /*
2688 * Assert: the variable p is either (1) the page with the
2689 * least pindex greater than or equal to the parameter pindex
2690 * or (2) NULL.
2691 */
2692 for (;
2693 p != NULL && (tmpidx = p->pindex - pindex) < psize;
2694 p = TAILQ_NEXT(p, listq)) {
2695 /*
2696 * don't allow an madvise to blow away our really
2697 * free pages allocating pv entries.
2698 */
2699 if (((flags & MAP_PREFAULT_MADVISE) != 0 &&
2700 vm_page_count_severe()) ||
2701 ((flags & MAP_PREFAULT_PARTIAL) != 0 &&
2702 tmpidx >= threshold)) {
2703 psize = tmpidx;
2704 break;
2705 }
2706 if (vm_page_all_valid(p)) {
2707 if (p_start == NULL) {
2708 start = addr + ptoa(tmpidx);
2709 p_start = p;
2710 }
2711 /* Jump ahead if a superpage mapping is possible. */
2712 if (p->psind > 0 && ((addr + ptoa(tmpidx)) &
2713 (pagesizes[p->psind] - 1)) == 0) {
2714 mask = atop(pagesizes[p->psind]) - 1;
2715 if (tmpidx + mask < psize &&
2716 vm_page_ps_test(p, PS_ALL_VALID, NULL)) {
2717 p += mask;
2718 threshold += mask;
2719 }
2720 }
2721 } else if (p_start != NULL) {
2722 pmap_enter_object(map->pmap, start, addr +
2723 ptoa(tmpidx), p_start, prot);
2724 p_start = NULL;
2725 }
2726 }
2727 if (p_start != NULL)
2728 pmap_enter_object(map->pmap, start, addr + ptoa(psize),
2729 p_start, prot);
2730 VM_OBJECT_RUNLOCK(object);
2731 }
2732
2733 /*
2734 * vm_map_protect:
2735 *
2736 * Sets the protection and/or the maximum protection of the
2737 * specified address region in the target map.
2738 */
2739 int
2740 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
2741 vm_prot_t new_prot, vm_prot_t new_maxprot, int flags)
2742 {
2743 vm_map_entry_t entry, first_entry, in_tran, prev_entry;
2744 vm_object_t obj;
2745 struct ucred *cred;
2746 vm_prot_t old_prot;
2747 int rv;
2748
2749 if (start == end)
2750 return (KERN_SUCCESS);
2751
2752 if ((flags & (VM_MAP_PROTECT_SET_PROT | VM_MAP_PROTECT_SET_MAXPROT)) ==
2753 (VM_MAP_PROTECT_SET_PROT | VM_MAP_PROTECT_SET_MAXPROT) &&
2754 (new_prot & new_maxprot) != new_prot)
2755 return (KERN_OUT_OF_BOUNDS);
2756
2757 again:
2758 in_tran = NULL;
2759 vm_map_lock(map);
2760
2761 if ((map->flags & MAP_WXORX) != 0 &&
2762 (flags & VM_MAP_PROTECT_SET_PROT) != 0 &&
2763 (new_prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE |
2764 VM_PROT_EXECUTE)) {
2765 vm_map_unlock(map);
2766 return (KERN_PROTECTION_FAILURE);
2767 }
2768
2769 /*
2770 * Ensure that we are not concurrently wiring pages. vm_map_wire() may
2771 * need to fault pages into the map and will drop the map lock while
2772 * doing so, and the VM object may end up in an inconsistent state if we
2773 * update the protection on the map entry in between faults.
2774 */
2775 vm_map_wait_busy(map);
2776
2777 VM_MAP_RANGE_CHECK(map, start, end);
2778
2779 if (!vm_map_lookup_entry(map, start, &first_entry))
2780 first_entry = vm_map_entry_succ(first_entry);
2781
2782 /*
2783 * Make a first pass to check for protection violations.
2784 */
2785 for (entry = first_entry; entry->start < end;
2786 entry = vm_map_entry_succ(entry)) {
2787 if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
2788 continue;
2789 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
2790 vm_map_unlock(map);
2791 return (KERN_INVALID_ARGUMENT);
2792 }
2793 if ((flags & VM_MAP_PROTECT_SET_PROT) == 0)
2794 new_prot = entry->protection;
2795 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) == 0)
2796 new_maxprot = entry->max_protection;
2797 if ((new_prot & entry->max_protection) != new_prot ||
2798 (new_maxprot & entry->max_protection) != new_maxprot) {
2799 vm_map_unlock(map);
2800 return (KERN_PROTECTION_FAILURE);
2801 }
2802 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0)
2803 in_tran = entry;
2804 }
2805
2806 /*
2807 * Postpone the operation until all in-transition map entries have
2808 * stabilized. An in-transition entry might already have its pages
2809 * wired and wired_count incremented, but not yet have its
2810 * MAP_ENTRY_USER_WIRED flag set. In which case, we would fail to call
2811 * vm_fault_copy_entry() in the final loop below.
2812 */
2813 if (in_tran != NULL) {
2814 in_tran->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
2815 vm_map_unlock_and_wait(map, 0);
2816 goto again;
2817 }
2818
2819 /*
2820 * Before changing the protections, try to reserve swap space for any
2821 * private (i.e., copy-on-write) mappings that are transitioning from
2822 * read-only to read/write access. If a reservation fails, break out
2823 * of this loop early and let the next loop simplify the entries, since
2824 * some may now be mergeable.
2825 */
2826 rv = vm_map_clip_start(map, first_entry, start);
2827 if (rv != KERN_SUCCESS) {
2828 vm_map_unlock(map);
2829 return (rv);
2830 }
2831 for (entry = first_entry; entry->start < end;
2832 entry = vm_map_entry_succ(entry)) {
2833 rv = vm_map_clip_end(map, entry, end);
2834 if (rv != KERN_SUCCESS) {
2835 vm_map_unlock(map);
2836 return (rv);
2837 }
2838
2839 if ((flags & VM_MAP_PROTECT_SET_PROT) == 0 ||
2840 ((new_prot & ~entry->protection) & VM_PROT_WRITE) == 0 ||
2841 ENTRY_CHARGED(entry) ||
2842 (entry->eflags & MAP_ENTRY_GUARD) != 0)
2843 continue;
2844
2845 cred = curthread->td_ucred;
2846 obj = entry->object.vm_object;
2847
2848 if (obj == NULL ||
2849 (entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0) {
2850 if (!swap_reserve(entry->end - entry->start)) {
2851 rv = KERN_RESOURCE_SHORTAGE;
2852 end = entry->end;
2853 break;
2854 }
2855 crhold(cred);
2856 entry->cred = cred;
2857 continue;
2858 }
2859
2860 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP)
2861 continue;
2862 VM_OBJECT_WLOCK(obj);
2863 if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP) {
2864 VM_OBJECT_WUNLOCK(obj);
2865 continue;
2866 }
2867
2868 /*
2869 * Charge for the whole object allocation now, since
2870 * we cannot distinguish between non-charged and
2871 * charged clipped mapping of the same object later.
2872 */
2873 KASSERT(obj->charge == 0,
2874 ("vm_map_protect: object %p overcharged (entry %p)",
2875 obj, entry));
2876 if (!swap_reserve(ptoa(obj->size))) {
2877 VM_OBJECT_WUNLOCK(obj);
2878 rv = KERN_RESOURCE_SHORTAGE;
2879 end = entry->end;
2880 break;
2881 }
2882
2883 crhold(cred);
2884 obj->cred = cred;
2885 obj->charge = ptoa(obj->size);
2886 VM_OBJECT_WUNLOCK(obj);
2887 }
2888
2889 /*
2890 * If enough swap space was available, go back and fix up protections.
2891 * Otherwise, just simplify entries, since some may have been modified.
2892 * [Note that clipping is not necessary the second time.]
2893 */
2894 for (prev_entry = vm_map_entry_pred(first_entry), entry = first_entry;
2895 entry->start < end;
2896 vm_map_try_merge_entries(map, prev_entry, entry),
2897 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
2898 if (rv != KERN_SUCCESS ||
2899 (entry->eflags & MAP_ENTRY_GUARD) != 0)
2900 continue;
2901
2902 old_prot = entry->protection;
2903
2904 if ((flags & VM_MAP_PROTECT_SET_MAXPROT) != 0) {
2905 entry->max_protection = new_maxprot;
2906 entry->protection = new_maxprot & old_prot;
2907 }
2908 if ((flags & VM_MAP_PROTECT_SET_PROT) != 0)
2909 entry->protection = new_prot;
2910
2911 /*
2912 * For user wired map entries, the normal lazy evaluation of
2913 * write access upgrades through soft page faults is
2914 * undesirable. Instead, immediately copy any pages that are
2915 * copy-on-write and enable write access in the physical map.
2916 */
2917 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0 &&
2918 (entry->protection & VM_PROT_WRITE) != 0 &&
2919 (old_prot & VM_PROT_WRITE) == 0)
2920 vm_fault_copy_entry(map, map, entry, entry, NULL);
2921
2922 /*
2923 * When restricting access, update the physical map. Worry
2924 * about copy-on-write here.
2925 */
2926 if ((old_prot & ~entry->protection) != 0) {
2927 #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \
2928 VM_PROT_ALL)
2929 pmap_protect(map->pmap, entry->start,
2930 entry->end,
2931 entry->protection & MASK(entry));
2932 #undef MASK
2933 }
2934 }
2935 vm_map_try_merge_entries(map, prev_entry, entry);
2936 vm_map_unlock(map);
2937 return (rv);
2938 }
2939
2940 /*
2941 * vm_map_madvise:
2942 *
2943 * This routine traverses a processes map handling the madvise
2944 * system call. Advisories are classified as either those effecting
2945 * the vm_map_entry structure, or those effecting the underlying
2946 * objects.
2947 */
2948 int
2949 vm_map_madvise(
2950 vm_map_t map,
2951 vm_offset_t start,
2952 vm_offset_t end,
2953 int behav)
2954 {
2955 vm_map_entry_t entry, prev_entry;
2956 int rv;
2957 bool modify_map;
2958
2959 /*
2960 * Some madvise calls directly modify the vm_map_entry, in which case
2961 * we need to use an exclusive lock on the map and we need to perform
2962 * various clipping operations. Otherwise we only need a read-lock
2963 * on the map.
2964 */
2965 switch(behav) {
2966 case MADV_NORMAL:
2967 case MADV_SEQUENTIAL:
2968 case MADV_RANDOM:
2969 case MADV_NOSYNC:
2970 case MADV_AUTOSYNC:
2971 case MADV_NOCORE:
2972 case MADV_CORE:
2973 if (start == end)
2974 return (0);
2975 modify_map = true;
2976 vm_map_lock(map);
2977 break;
2978 case MADV_WILLNEED:
2979 case MADV_DONTNEED:
2980 case MADV_FREE:
2981 if (start == end)
2982 return (0);
2983 modify_map = false;
2984 vm_map_lock_read(map);
2985 break;
2986 default:
2987 return (EINVAL);
2988 }
2989
2990 /*
2991 * Locate starting entry and clip if necessary.
2992 */
2993 VM_MAP_RANGE_CHECK(map, start, end);
2994
2995 if (modify_map) {
2996 /*
2997 * madvise behaviors that are implemented in the vm_map_entry.
2998 *
2999 * We clip the vm_map_entry so that behavioral changes are
3000 * limited to the specified address range.
3001 */
3002 rv = vm_map_lookup_clip_start(map, start, &entry, &prev_entry);
3003 if (rv != KERN_SUCCESS) {
3004 vm_map_unlock(map);
3005 return (vm_mmap_to_errno(rv));
3006 }
3007
3008 for (; entry->start < end; prev_entry = entry,
3009 entry = vm_map_entry_succ(entry)) {
3010 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3011 continue;
3012
3013 rv = vm_map_clip_end(map, entry, end);
3014 if (rv != KERN_SUCCESS) {
3015 vm_map_unlock(map);
3016 return (vm_mmap_to_errno(rv));
3017 }
3018
3019 switch (behav) {
3020 case MADV_NORMAL:
3021 vm_map_entry_set_behavior(entry,
3022 MAP_ENTRY_BEHAV_NORMAL);
3023 break;
3024 case MADV_SEQUENTIAL:
3025 vm_map_entry_set_behavior(entry,
3026 MAP_ENTRY_BEHAV_SEQUENTIAL);
3027 break;
3028 case MADV_RANDOM:
3029 vm_map_entry_set_behavior(entry,
3030 MAP_ENTRY_BEHAV_RANDOM);
3031 break;
3032 case MADV_NOSYNC:
3033 entry->eflags |= MAP_ENTRY_NOSYNC;
3034 break;
3035 case MADV_AUTOSYNC:
3036 entry->eflags &= ~MAP_ENTRY_NOSYNC;
3037 break;
3038 case MADV_NOCORE:
3039 entry->eflags |= MAP_ENTRY_NOCOREDUMP;
3040 break;
3041 case MADV_CORE:
3042 entry->eflags &= ~MAP_ENTRY_NOCOREDUMP;
3043 break;
3044 default:
3045 break;
3046 }
3047 vm_map_try_merge_entries(map, prev_entry, entry);
3048 }
3049 vm_map_try_merge_entries(map, prev_entry, entry);
3050 vm_map_unlock(map);
3051 } else {
3052 vm_pindex_t pstart, pend;
3053
3054 /*
3055 * madvise behaviors that are implemented in the underlying
3056 * vm_object.
3057 *
3058 * Since we don't clip the vm_map_entry, we have to clip
3059 * the vm_object pindex and count.
3060 */
3061 if (!vm_map_lookup_entry(map, start, &entry))
3062 entry = vm_map_entry_succ(entry);
3063 for (; entry->start < end;
3064 entry = vm_map_entry_succ(entry)) {
3065 vm_offset_t useEnd, useStart;
3066
3067 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
3068 continue;
3069
3070 /*
3071 * MADV_FREE would otherwise rewind time to
3072 * the creation of the shadow object. Because
3073 * we hold the VM map read-locked, neither the
3074 * entry's object nor the presence of a
3075 * backing object can change.
3076 */
3077 if (behav == MADV_FREE &&
3078 entry->object.vm_object != NULL &&
3079 entry->object.vm_object->backing_object != NULL)
3080 continue;
3081
3082 pstart = OFF_TO_IDX(entry->offset);
3083 pend = pstart + atop(entry->end - entry->start);
3084 useStart = entry->start;
3085 useEnd = entry->end;
3086
3087 if (entry->start < start) {
3088 pstart += atop(start - entry->start);
3089 useStart = start;
3090 }
3091 if (entry->end > end) {
3092 pend -= atop(entry->end - end);
3093 useEnd = end;
3094 }
3095
3096 if (pstart >= pend)
3097 continue;
3098
3099 /*
3100 * Perform the pmap_advise() before clearing
3101 * PGA_REFERENCED in vm_page_advise(). Otherwise, a
3102 * concurrent pmap operation, such as pmap_remove(),
3103 * could clear a reference in the pmap and set
3104 * PGA_REFERENCED on the page before the pmap_advise()
3105 * had completed. Consequently, the page would appear
3106 * referenced based upon an old reference that
3107 * occurred before this pmap_advise() ran.
3108 */
3109 if (behav == MADV_DONTNEED || behav == MADV_FREE)
3110 pmap_advise(map->pmap, useStart, useEnd,
3111 behav);
3112
3113 vm_object_madvise(entry->object.vm_object, pstart,
3114 pend, behav);
3115
3116 /*
3117 * Pre-populate paging structures in the
3118 * WILLNEED case. For wired entries, the
3119 * paging structures are already populated.
3120 */
3121 if (behav == MADV_WILLNEED &&
3122 entry->wired_count == 0) {
3123 vm_map_pmap_enter(map,
3124 useStart,
3125 entry->protection,
3126 entry->object.vm_object,
3127 pstart,
3128 ptoa(pend - pstart),
3129 MAP_PREFAULT_MADVISE
3130 );
3131 }
3132 }
3133 vm_map_unlock_read(map);
3134 }
3135 return (0);
3136 }
3137
3138 /*
3139 * vm_map_inherit:
3140 *
3141 * Sets the inheritance of the specified address
3142 * range in the target map. Inheritance
3143 * affects how the map will be shared with
3144 * child maps at the time of vmspace_fork.
3145 */
3146 int
3147 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
3148 vm_inherit_t new_inheritance)
3149 {
3150 vm_map_entry_t entry, lentry, prev_entry, start_entry;
3151 int rv;
3152
3153 switch (new_inheritance) {
3154 case VM_INHERIT_NONE:
3155 case VM_INHERIT_COPY:
3156 case VM_INHERIT_SHARE:
3157 case VM_INHERIT_ZERO:
3158 break;
3159 default:
3160 return (KERN_INVALID_ARGUMENT);
3161 }
3162 if (start == end)
3163 return (KERN_SUCCESS);
3164 vm_map_lock(map);
3165 VM_MAP_RANGE_CHECK(map, start, end);
3166 rv = vm_map_lookup_clip_start(map, start, &start_entry, &prev_entry);
3167 if (rv != KERN_SUCCESS)
3168 goto unlock;
3169 if (vm_map_lookup_entry(map, end - 1, &lentry)) {
3170 rv = vm_map_clip_end(map, lentry, end);
3171 if (rv != KERN_SUCCESS)
3172 goto unlock;
3173 }
3174 if (new_inheritance == VM_INHERIT_COPY) {
3175 for (entry = start_entry; entry->start < end;
3176 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3177 if ((entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3178 != 0) {
3179 rv = KERN_INVALID_ARGUMENT;
3180 goto unlock;
3181 }
3182 }
3183 }
3184 for (entry = start_entry; entry->start < end; prev_entry = entry,
3185 entry = vm_map_entry_succ(entry)) {
3186 KASSERT(entry->end <= end, ("non-clipped entry %p end %jx %jx",
3187 entry, (uintmax_t)entry->end, (uintmax_t)end));
3188 if ((entry->eflags & MAP_ENTRY_GUARD) == 0 ||
3189 new_inheritance != VM_INHERIT_ZERO)
3190 entry->inheritance = new_inheritance;
3191 vm_map_try_merge_entries(map, prev_entry, entry);
3192 }
3193 vm_map_try_merge_entries(map, prev_entry, entry);
3194 unlock:
3195 vm_map_unlock(map);
3196 return (rv);
3197 }
3198
3199 /*
3200 * vm_map_entry_in_transition:
3201 *
3202 * Release the map lock, and sleep until the entry is no longer in
3203 * transition. Awake and acquire the map lock. If the map changed while
3204 * another held the lock, lookup a possibly-changed entry at or after the
3205 * 'start' position of the old entry.
3206 */
3207 static vm_map_entry_t
3208 vm_map_entry_in_transition(vm_map_t map, vm_offset_t in_start,
3209 vm_offset_t *io_end, bool holes_ok, vm_map_entry_t in_entry)
3210 {
3211 vm_map_entry_t entry;
3212 vm_offset_t start;
3213 u_int last_timestamp;
3214
3215 VM_MAP_ASSERT_LOCKED(map);
3216 KASSERT((in_entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3217 ("not in-tranition map entry %p", in_entry));
3218 /*
3219 * We have not yet clipped the entry.
3220 */
3221 start = MAX(in_start, in_entry->start);
3222 in_entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3223 last_timestamp = map->timestamp;
3224 if (vm_map_unlock_and_wait(map, 0)) {
3225 /*
3226 * Allow interruption of user wiring/unwiring?
3227 */
3228 }
3229 vm_map_lock(map);
3230 if (last_timestamp + 1 == map->timestamp)
3231 return (in_entry);
3232
3233 /*
3234 * Look again for the entry because the map was modified while it was
3235 * unlocked. Specifically, the entry may have been clipped, merged, or
3236 * deleted.
3237 */
3238 if (!vm_map_lookup_entry(map, start, &entry)) {
3239 if (!holes_ok) {
3240 *io_end = start;
3241 return (NULL);
3242 }
3243 entry = vm_map_entry_succ(entry);
3244 }
3245 return (entry);
3246 }
3247
3248 /*
3249 * vm_map_unwire:
3250 *
3251 * Implements both kernel and user unwiring.
3252 */
3253 int
3254 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
3255 int flags)
3256 {
3257 vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3258 int rv;
3259 bool holes_ok, need_wakeup, user_unwire;
3260
3261 if (start == end)
3262 return (KERN_SUCCESS);
3263 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3264 user_unwire = (flags & VM_MAP_WIRE_USER) != 0;
3265 vm_map_lock(map);
3266 VM_MAP_RANGE_CHECK(map, start, end);
3267 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3268 if (holes_ok)
3269 first_entry = vm_map_entry_succ(first_entry);
3270 else {
3271 vm_map_unlock(map);
3272 return (KERN_INVALID_ADDRESS);
3273 }
3274 }
3275 rv = KERN_SUCCESS;
3276 for (entry = first_entry; entry->start < end; entry = next_entry) {
3277 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3278 /*
3279 * We have not yet clipped the entry.
3280 */
3281 next_entry = vm_map_entry_in_transition(map, start,
3282 &end, holes_ok, entry);
3283 if (next_entry == NULL) {
3284 if (entry == first_entry) {
3285 vm_map_unlock(map);
3286 return (KERN_INVALID_ADDRESS);
3287 }
3288 rv = KERN_INVALID_ADDRESS;
3289 break;
3290 }
3291 first_entry = (entry == first_entry) ?
3292 next_entry : NULL;
3293 continue;
3294 }
3295 rv = vm_map_clip_start(map, entry, start);
3296 if (rv != KERN_SUCCESS)
3297 break;
3298 rv = vm_map_clip_end(map, entry, end);
3299 if (rv != KERN_SUCCESS)
3300 break;
3301
3302 /*
3303 * Mark the entry in case the map lock is released. (See
3304 * above.)
3305 */
3306 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3307 entry->wiring_thread == NULL,
3308 ("owned map entry %p", entry));
3309 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3310 entry->wiring_thread = curthread;
3311 next_entry = vm_map_entry_succ(entry);
3312 /*
3313 * Check the map for holes in the specified region.
3314 * If holes_ok, skip this check.
3315 */
3316 if (!holes_ok &&
3317 entry->end < end && next_entry->start > entry->end) {
3318 end = entry->end;
3319 rv = KERN_INVALID_ADDRESS;
3320 break;
3321 }
3322 /*
3323 * If system unwiring, require that the entry is system wired.
3324 */
3325 if (!user_unwire &&
3326 vm_map_entry_system_wired_count(entry) == 0) {
3327 end = entry->end;
3328 rv = KERN_INVALID_ARGUMENT;
3329 break;
3330 }
3331 }
3332 need_wakeup = false;
3333 if (first_entry == NULL &&
3334 !vm_map_lookup_entry(map, start, &first_entry)) {
3335 KASSERT(holes_ok, ("vm_map_unwire: lookup failed"));
3336 prev_entry = first_entry;
3337 entry = vm_map_entry_succ(first_entry);
3338 } else {
3339 prev_entry = vm_map_entry_pred(first_entry);
3340 entry = first_entry;
3341 }
3342 for (; entry->start < end;
3343 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3344 /*
3345 * If holes_ok was specified, an empty
3346 * space in the unwired region could have been mapped
3347 * while the map lock was dropped for draining
3348 * MAP_ENTRY_IN_TRANSITION. Moreover, another thread
3349 * could be simultaneously wiring this new mapping
3350 * entry. Detect these cases and skip any entries
3351 * marked as in transition by us.
3352 */
3353 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3354 entry->wiring_thread != curthread) {
3355 KASSERT(holes_ok,
3356 ("vm_map_unwire: !HOLESOK and new/changed entry"));
3357 continue;
3358 }
3359
3360 if (rv == KERN_SUCCESS && (!user_unwire ||
3361 (entry->eflags & MAP_ENTRY_USER_WIRED))) {
3362 if (entry->wired_count == 1)
3363 vm_map_entry_unwire(map, entry);
3364 else
3365 entry->wired_count--;
3366 if (user_unwire)
3367 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
3368 }
3369 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3370 ("vm_map_unwire: in-transition flag missing %p", entry));
3371 KASSERT(entry->wiring_thread == curthread,
3372 ("vm_map_unwire: alien wire %p", entry));
3373 entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
3374 entry->wiring_thread = NULL;
3375 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3376 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3377 need_wakeup = true;
3378 }
3379 vm_map_try_merge_entries(map, prev_entry, entry);
3380 }
3381 vm_map_try_merge_entries(map, prev_entry, entry);
3382 vm_map_unlock(map);
3383 if (need_wakeup)
3384 vm_map_wakeup(map);
3385 return (rv);
3386 }
3387
3388 static void
3389 vm_map_wire_user_count_sub(u_long npages)
3390 {
3391
3392 atomic_subtract_long(&vm_user_wire_count, npages);
3393 }
3394
3395 static bool
3396 vm_map_wire_user_count_add(u_long npages)
3397 {
3398 u_long wired;
3399
3400 wired = vm_user_wire_count;
3401 do {
3402 if (npages + wired > vm_page_max_user_wired)
3403 return (false);
3404 } while (!atomic_fcmpset_long(&vm_user_wire_count, &wired,
3405 npages + wired));
3406
3407 return (true);
3408 }
3409
3410 /*
3411 * vm_map_wire_entry_failure:
3412 *
3413 * Handle a wiring failure on the given entry.
3414 *
3415 * The map should be locked.
3416 */
3417 static void
3418 vm_map_wire_entry_failure(vm_map_t map, vm_map_entry_t entry,
3419 vm_offset_t failed_addr)
3420 {
3421
3422 VM_MAP_ASSERT_LOCKED(map);
3423 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 &&
3424 entry->wired_count == 1,
3425 ("vm_map_wire_entry_failure: entry %p isn't being wired", entry));
3426 KASSERT(failed_addr < entry->end,
3427 ("vm_map_wire_entry_failure: entry %p was fully wired", entry));
3428
3429 /*
3430 * If any pages at the start of this entry were successfully wired,
3431 * then unwire them.
3432 */
3433 if (failed_addr > entry->start) {
3434 pmap_unwire(map->pmap, entry->start, failed_addr);
3435 vm_object_unwire(entry->object.vm_object, entry->offset,
3436 failed_addr - entry->start, PQ_ACTIVE);
3437 }
3438
3439 /*
3440 * Assign an out-of-range value to represent the failure to wire this
3441 * entry.
3442 */
3443 entry->wired_count = -1;
3444 }
3445
3446 int
3447 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3448 {
3449 int rv;
3450
3451 vm_map_lock(map);
3452 rv = vm_map_wire_locked(map, start, end, flags);
3453 vm_map_unlock(map);
3454 return (rv);
3455 }
3456
3457 /*
3458 * vm_map_wire_locked:
3459 *
3460 * Implements both kernel and user wiring. Returns with the map locked,
3461 * the map lock may be dropped.
3462 */
3463 int
3464 vm_map_wire_locked(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags)
3465 {
3466 vm_map_entry_t entry, first_entry, next_entry, prev_entry;
3467 vm_offset_t faddr, saved_end, saved_start;
3468 u_long incr, npages;
3469 u_int bidx, last_timestamp;
3470 int rv;
3471 bool holes_ok, need_wakeup, user_wire;
3472 vm_prot_t prot;
3473
3474 VM_MAP_ASSERT_LOCKED(map);
3475
3476 if (start == end)
3477 return (KERN_SUCCESS);
3478 prot = 0;
3479 if (flags & VM_MAP_WIRE_WRITE)
3480 prot |= VM_PROT_WRITE;
3481 holes_ok = (flags & VM_MAP_WIRE_HOLESOK) != 0;
3482 user_wire = (flags & VM_MAP_WIRE_USER) != 0;
3483 VM_MAP_RANGE_CHECK(map, start, end);
3484 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3485 if (holes_ok)
3486 first_entry = vm_map_entry_succ(first_entry);
3487 else
3488 return (KERN_INVALID_ADDRESS);
3489 }
3490 for (entry = first_entry; entry->start < end; entry = next_entry) {
3491 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
3492 /*
3493 * We have not yet clipped the entry.
3494 */
3495 next_entry = vm_map_entry_in_transition(map, start,
3496 &end, holes_ok, entry);
3497 if (next_entry == NULL) {
3498 if (entry == first_entry)
3499 return (KERN_INVALID_ADDRESS);
3500 rv = KERN_INVALID_ADDRESS;
3501 goto done;
3502 }
3503 first_entry = (entry == first_entry) ?
3504 next_entry : NULL;
3505 continue;
3506 }
3507 rv = vm_map_clip_start(map, entry, start);
3508 if (rv != KERN_SUCCESS)
3509 goto done;
3510 rv = vm_map_clip_end(map, entry, end);
3511 if (rv != KERN_SUCCESS)
3512 goto done;
3513
3514 /*
3515 * Mark the entry in case the map lock is released. (See
3516 * above.)
3517 */
3518 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 &&
3519 entry->wiring_thread == NULL,
3520 ("owned map entry %p", entry));
3521 entry->eflags |= MAP_ENTRY_IN_TRANSITION;
3522 entry->wiring_thread = curthread;
3523 if ((entry->protection & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0
3524 || (entry->protection & prot) != prot) {
3525 entry->eflags |= MAP_ENTRY_WIRE_SKIPPED;
3526 if (!holes_ok) {
3527 end = entry->end;
3528 rv = KERN_INVALID_ADDRESS;
3529 goto done;
3530 }
3531 } else if (entry->wired_count == 0) {
3532 entry->wired_count++;
3533
3534 npages = atop(entry->end - entry->start);
3535 if (user_wire && !vm_map_wire_user_count_add(npages)) {
3536 vm_map_wire_entry_failure(map, entry,
3537 entry->start);
3538 end = entry->end;
3539 rv = KERN_RESOURCE_SHORTAGE;
3540 goto done;
3541 }
3542
3543 /*
3544 * Release the map lock, relying on the in-transition
3545 * mark. Mark the map busy for fork.
3546 */
3547 saved_start = entry->start;
3548 saved_end = entry->end;
3549 last_timestamp = map->timestamp;
3550 bidx = (entry->eflags & MAP_ENTRY_SPLIT_BOUNDARY_MASK)
3551 >> MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
3552 incr = pagesizes[bidx];
3553 vm_map_busy(map);
3554 vm_map_unlock(map);
3555
3556 for (faddr = saved_start; faddr < saved_end;
3557 faddr += incr) {
3558 /*
3559 * Simulate a fault to get the page and enter
3560 * it into the physical map.
3561 */
3562 rv = vm_fault(map, faddr, VM_PROT_NONE,
3563 VM_FAULT_WIRE, NULL);
3564 if (rv != KERN_SUCCESS)
3565 break;
3566 }
3567 vm_map_lock(map);
3568 vm_map_unbusy(map);
3569 if (last_timestamp + 1 != map->timestamp) {
3570 /*
3571 * Look again for the entry because the map was
3572 * modified while it was unlocked. The entry
3573 * may have been clipped, but NOT merged or
3574 * deleted.
3575 */
3576 if (!vm_map_lookup_entry(map, saved_start,
3577 &next_entry))
3578 KASSERT(false,
3579 ("vm_map_wire: lookup failed"));
3580 first_entry = (entry == first_entry) ?
3581 next_entry : NULL;
3582 for (entry = next_entry; entry->end < saved_end;
3583 entry = vm_map_entry_succ(entry)) {
3584 /*
3585 * In case of failure, handle entries
3586 * that were not fully wired here;
3587 * fully wired entries are handled
3588 * later.
3589 */
3590 if (rv != KERN_SUCCESS &&
3591 faddr < entry->end)
3592 vm_map_wire_entry_failure(map,
3593 entry, faddr);
3594 }
3595 }
3596 if (rv != KERN_SUCCESS) {
3597 vm_map_wire_entry_failure(map, entry, faddr);
3598 if (user_wire)
3599 vm_map_wire_user_count_sub(npages);
3600 end = entry->end;
3601 goto done;
3602 }
3603 } else if (!user_wire ||
3604 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3605 entry->wired_count++;
3606 }
3607 /*
3608 * Check the map for holes in the specified region.
3609 * If holes_ok was specified, skip this check.
3610 */
3611 next_entry = vm_map_entry_succ(entry);
3612 if (!holes_ok &&
3613 entry->end < end && next_entry->start > entry->end) {
3614 end = entry->end;
3615 rv = KERN_INVALID_ADDRESS;
3616 goto done;
3617 }
3618 }
3619 rv = KERN_SUCCESS;
3620 done:
3621 need_wakeup = false;
3622 if (first_entry == NULL &&
3623 !vm_map_lookup_entry(map, start, &first_entry)) {
3624 KASSERT(holes_ok, ("vm_map_wire: lookup failed"));
3625 prev_entry = first_entry;
3626 entry = vm_map_entry_succ(first_entry);
3627 } else {
3628 prev_entry = vm_map_entry_pred(first_entry);
3629 entry = first_entry;
3630 }
3631 for (; entry->start < end;
3632 prev_entry = entry, entry = vm_map_entry_succ(entry)) {
3633 /*
3634 * If holes_ok was specified, an empty
3635 * space in the unwired region could have been mapped
3636 * while the map lock was dropped for faulting in the
3637 * pages or draining MAP_ENTRY_IN_TRANSITION.
3638 * Moreover, another thread could be simultaneously
3639 * wiring this new mapping entry. Detect these cases
3640 * and skip any entries marked as in transition not by us.
3641 *
3642 * Another way to get an entry not marked with
3643 * MAP_ENTRY_IN_TRANSITION is after failed clipping,
3644 * which set rv to KERN_INVALID_ARGUMENT.
3645 */
3646 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) == 0 ||
3647 entry->wiring_thread != curthread) {
3648 KASSERT(holes_ok || rv == KERN_INVALID_ARGUMENT,
3649 ("vm_map_wire: !HOLESOK and new/changed entry"));
3650 continue;
3651 }
3652
3653 if ((entry->eflags & MAP_ENTRY_WIRE_SKIPPED) != 0) {
3654 /* do nothing */
3655 } else if (rv == KERN_SUCCESS) {
3656 if (user_wire)
3657 entry->eflags |= MAP_ENTRY_USER_WIRED;
3658 } else if (entry->wired_count == -1) {
3659 /*
3660 * Wiring failed on this entry. Thus, unwiring is
3661 * unnecessary.
3662 */
3663 entry->wired_count = 0;
3664 } else if (!user_wire ||
3665 (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
3666 /*
3667 * Undo the wiring. Wiring succeeded on this entry
3668 * but failed on a later entry.
3669 */
3670 if (entry->wired_count == 1) {
3671 vm_map_entry_unwire(map, entry);
3672 if (user_wire)
3673 vm_map_wire_user_count_sub(
3674 atop(entry->end - entry->start));
3675 } else
3676 entry->wired_count--;
3677 }
3678 KASSERT((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0,
3679 ("vm_map_wire: in-transition flag missing %p", entry));
3680 KASSERT(entry->wiring_thread == curthread,
3681 ("vm_map_wire: alien wire %p", entry));
3682 entry->eflags &= ~(MAP_ENTRY_IN_TRANSITION |
3683 MAP_ENTRY_WIRE_SKIPPED);
3684 entry->wiring_thread = NULL;
3685 if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
3686 entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
3687 need_wakeup = true;
3688 }
3689 vm_map_try_merge_entries(map, prev_entry, entry);
3690 }
3691 vm_map_try_merge_entries(map, prev_entry, entry);
3692 if (need_wakeup)
3693 vm_map_wakeup(map);
3694 return (rv);
3695 }
3696
3697 /*
3698 * vm_map_sync
3699 *
3700 * Push any dirty cached pages in the address range to their pager.
3701 * If syncio is TRUE, dirty pages are written synchronously.
3702 * If invalidate is TRUE, any cached pages are freed as well.
3703 *
3704 * If the size of the region from start to end is zero, we are
3705 * supposed to flush all modified pages within the region containing
3706 * start. Unfortunately, a region can be split or coalesced with
3707 * neighboring regions, making it difficult to determine what the
3708 * original region was. Therefore, we approximate this requirement by
3709 * flushing the current region containing start.
3710 *
3711 * Returns an error if any part of the specified range is not mapped.
3712 */
3713 int
3714 vm_map_sync(
3715 vm_map_t map,
3716 vm_offset_t start,
3717 vm_offset_t end,
3718 boolean_t syncio,
3719 boolean_t invalidate)
3720 {
3721 vm_map_entry_t entry, first_entry, next_entry;
3722 vm_size_t size;
3723 vm_object_t object;
3724 vm_ooffset_t offset;
3725 unsigned int last_timestamp;
3726 int bdry_idx;
3727 boolean_t failed;
3728
3729 vm_map_lock_read(map);
3730 VM_MAP_RANGE_CHECK(map, start, end);
3731 if (!vm_map_lookup_entry(map, start, &first_entry)) {
3732 vm_map_unlock_read(map);
3733 return (KERN_INVALID_ADDRESS);
3734 } else if (start == end) {
3735 start = first_entry->start;
3736 end = first_entry->end;
3737 }
3738
3739 /*
3740 * Make a first pass to check for user-wired memory, holes,
3741 * and partial invalidation of largepage mappings.
3742 */
3743 for (entry = first_entry; entry->start < end; entry = next_entry) {
3744 if (invalidate) {
3745 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0) {
3746 vm_map_unlock_read(map);
3747 return (KERN_INVALID_ARGUMENT);
3748 }
3749 bdry_idx = (entry->eflags &
3750 MAP_ENTRY_SPLIT_BOUNDARY_MASK) >>
3751 MAP_ENTRY_SPLIT_BOUNDARY_SHIFT;
3752 if (bdry_idx != 0 &&
3753 ((start & (pagesizes[bdry_idx] - 1)) != 0 ||
3754 (end & (pagesizes[bdry_idx] - 1)) != 0)) {
3755 vm_map_unlock_read(map);
3756 return (KERN_INVALID_ARGUMENT);
3757 }
3758 }
3759 next_entry = vm_map_entry_succ(entry);
3760 if (end > entry->end &&
3761 entry->end != next_entry->start) {
3762 vm_map_unlock_read(map);
3763 return (KERN_INVALID_ADDRESS);
3764 }
3765 }
3766
3767 if (invalidate)
3768 pmap_remove(map->pmap, start, end);
3769 failed = FALSE;
3770
3771 /*
3772 * Make a second pass, cleaning/uncaching pages from the indicated
3773 * objects as we go.
3774 */
3775 for (entry = first_entry; entry->start < end;) {
3776 offset = entry->offset + (start - entry->start);
3777 size = (end <= entry->end ? end : entry->end) - start;
3778 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
3779 vm_map_t smap;
3780 vm_map_entry_t tentry;
3781 vm_size_t tsize;
3782
3783 smap = entry->object.sub_map;
3784 vm_map_lock_read(smap);
3785 (void) vm_map_lookup_entry(smap, offset, &tentry);
3786 tsize = tentry->end - offset;
3787 if (tsize < size)
3788 size = tsize;
3789 object = tentry->object.vm_object;
3790 offset = tentry->offset + (offset - tentry->start);
3791 vm_map_unlock_read(smap);
3792 } else {
3793 object = entry->object.vm_object;
3794 }
3795 vm_object_reference(object);
3796 last_timestamp = map->timestamp;
3797 vm_map_unlock_read(map);
3798 if (!vm_object_sync(object, offset, size, syncio, invalidate))
3799 failed = TRUE;
3800 start += size;
3801 vm_object_deallocate(object);
3802 vm_map_lock_read(map);
3803 if (last_timestamp == map->timestamp ||
3804 !vm_map_lookup_entry(map, start, &entry))
3805 entry = vm_map_entry_succ(entry);
3806 }
3807
3808 vm_map_unlock_read(map);
3809 return (failed ? KERN_FAILURE : KERN_SUCCESS);
3810 }
3811
3812 /*
3813 * vm_map_entry_unwire: [ internal use only ]
3814 *
3815 * Make the region specified by this entry pageable.
3816 *
3817 * The map in question should be locked.
3818 * [This is the reason for this routine's existence.]
3819 */
3820 static void
3821 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
3822 {
3823 vm_size_t size;
3824
3825 VM_MAP_ASSERT_LOCKED(map);
3826 KASSERT(entry->wired_count > 0,
3827 ("vm_map_entry_unwire: entry %p isn't wired", entry));
3828
3829 size = entry->end - entry->start;
3830 if ((entry->eflags & MAP_ENTRY_USER_WIRED) != 0)
3831 vm_map_wire_user_count_sub(atop(size));
3832 pmap_unwire(map->pmap, entry->start, entry->end);
3833 vm_object_unwire(entry->object.vm_object, entry->offset, size,
3834 PQ_ACTIVE);
3835 entry->wired_count = 0;
3836 }
3837
3838 static void
3839 vm_map_entry_deallocate(vm_map_entry_t entry, boolean_t system_map)
3840 {
3841
3842 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0)
3843 vm_object_deallocate(entry->object.vm_object);
3844 uma_zfree(system_map ? kmapentzone : mapentzone, entry);
3845 }
3846
3847 /*
3848 * vm_map_entry_delete: [ internal use only ]
3849 *
3850 * Deallocate the given entry from the target map.
3851 */
3852 static void
3853 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
3854 {
3855 vm_object_t object;
3856 vm_pindex_t offidxstart, offidxend, size1;
3857 vm_size_t size;
3858
3859 vm_map_entry_unlink(map, entry, UNLINK_MERGE_NONE);
3860 object = entry->object.vm_object;
3861
3862 if ((entry->eflags & MAP_ENTRY_GUARD) != 0) {
3863 MPASS(entry->cred == NULL);
3864 MPASS((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0);
3865 MPASS(object == NULL);
3866 vm_map_entry_deallocate(entry, map->system_map);
3867 return;
3868 }
3869
3870 size = entry->end - entry->start;
3871 map->size -= size;
3872
3873 if (entry->cred != NULL) {
3874 swap_release_by_cred(size, entry->cred);
3875 crfree(entry->cred);
3876 }
3877
3878 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 || object == NULL) {
3879 entry->object.vm_object = NULL;
3880 } else if ((object->flags & OBJ_ANON) != 0 ||
3881 object == kernel_object) {
3882 KASSERT(entry->cred == NULL || object->cred == NULL ||
3883 (entry->eflags & MAP_ENTRY_NEEDS_COPY),
3884 ("OVERCOMMIT vm_map_entry_delete: both cred %p", entry));
3885 offidxstart = OFF_TO_IDX(entry->offset);
3886 offidxend = offidxstart + atop(size);
3887 VM_OBJECT_WLOCK(object);
3888 if (object->ref_count != 1 &&
3889 ((object->flags & OBJ_ONEMAPPING) != 0 ||
3890 object == kernel_object)) {
3891 vm_object_collapse(object);
3892
3893 /*
3894 * The option OBJPR_NOTMAPPED can be passed here
3895 * because vm_map_delete() already performed
3896 * pmap_remove() on the only mapping to this range
3897 * of pages.
3898 */
3899 vm_object_page_remove(object, offidxstart, offidxend,
3900 OBJPR_NOTMAPPED);
3901 if (offidxend >= object->size &&
3902 offidxstart < object->size) {
3903 size1 = object->size;
3904 object->size = offidxstart;
3905 if (object->cred != NULL) {
3906 size1 -= object->size;
3907 KASSERT(object->charge >= ptoa(size1),
3908 ("object %p charge < 0", object));
3909 swap_release_by_cred(ptoa(size1),
3910 object->cred);
3911 object->charge -= ptoa(size1);
3912 }
3913 }
3914 }
3915 VM_OBJECT_WUNLOCK(object);
3916 }
3917 if (map->system_map)
3918 vm_map_entry_deallocate(entry, TRUE);
3919 else {
3920 entry->defer_next = curthread->td_map_def_user;
3921 curthread->td_map_def_user = entry;
3922 }
3923 }
3924
3925 /*
3926 * vm_map_delete: [ internal use only ]
3927 *
3928 * Deallocates the given address range from the target
3929 * map.
3930 */
3931 int
3932 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
3933 {
3934 vm_map_entry_t entry, next_entry, scratch_entry;
3935 int rv;
3936
3937 VM_MAP_ASSERT_LOCKED(map);
3938
3939 if (start == end)
3940 return (KERN_SUCCESS);
3941
3942 /*
3943 * Find the start of the region, and clip it.
3944 * Step through all entries in this region.
3945 */
3946 rv = vm_map_lookup_clip_start(map, start, &entry, &scratch_entry);
3947 if (rv != KERN_SUCCESS)
3948 return (rv);
3949 for (; entry->start < end; entry = next_entry) {
3950 /*
3951 * Wait for wiring or unwiring of an entry to complete.
3952 * Also wait for any system wirings to disappear on
3953 * user maps.
3954 */
3955 if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
3956 (vm_map_pmap(map) != kernel_pmap &&
3957 vm_map_entry_system_wired_count(entry) != 0)) {
3958 unsigned int last_timestamp;
3959 vm_offset_t saved_start;
3960
3961 saved_start = entry->start;
3962 entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
3963 last_timestamp = map->timestamp;
3964 (void) vm_map_unlock_and_wait(map, 0);
3965 vm_map_lock(map);
3966 if (last_timestamp + 1 != map->timestamp) {
3967 /*
3968 * Look again for the entry because the map was
3969 * modified while it was unlocked.
3970 * Specifically, the entry may have been
3971 * clipped, merged, or deleted.
3972 */
3973 rv = vm_map_lookup_clip_start(map, saved_start,
3974 &next_entry, &scratch_entry);
3975 if (rv != KERN_SUCCESS)
3976 break;
3977 } else
3978 next_entry = entry;
3979 continue;
3980 }
3981
3982 /* XXXKIB or delete to the upper superpage boundary ? */
3983 rv = vm_map_clip_end(map, entry, end);
3984 if (rv != KERN_SUCCESS)
3985 break;
3986 next_entry = vm_map_entry_succ(entry);
3987
3988 /*
3989 * Unwire before removing addresses from the pmap; otherwise,
3990 * unwiring will put the entries back in the pmap.
3991 */
3992 if (entry->wired_count != 0)
3993 vm_map_entry_unwire(map, entry);
3994
3995 /*
3996 * Remove mappings for the pages, but only if the
3997 * mappings could exist. For instance, it does not
3998 * make sense to call pmap_remove() for guard entries.
3999 */
4000 if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
4001 entry->object.vm_object != NULL)
4002 pmap_remove(map->pmap, entry->start, entry->end);
4003
4004 if (entry->end == map->anon_loc)
4005 map->anon_loc = entry->start;
4006
4007 /*
4008 * Delete the entry only after removing all pmap
4009 * entries pointing to its pages. (Otherwise, its
4010 * page frames may be reallocated, and any modify bits
4011 * will be set in the wrong object!)
4012 */
4013 vm_map_entry_delete(map, entry);
4014 }
4015 return (rv);
4016 }
4017
4018 /*
4019 * vm_map_remove:
4020 *
4021 * Remove the given address range from the target map.
4022 * This is the exported form of vm_map_delete.
4023 */
4024 int
4025 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
4026 {
4027 int result;
4028
4029 vm_map_lock(map);
4030 VM_MAP_RANGE_CHECK(map, start, end);
4031 result = vm_map_delete(map, start, end);
4032 vm_map_unlock(map);
4033 return (result);
4034 }
4035
4036 /*
4037 * vm_map_check_protection:
4038 *
4039 * Assert that the target map allows the specified privilege on the
4040 * entire address region given. The entire region must be allocated.
4041 *
4042 * WARNING! This code does not and should not check whether the
4043 * contents of the region is accessible. For example a smaller file
4044 * might be mapped into a larger address space.
4045 *
4046 * NOTE! This code is also called by munmap().
4047 *
4048 * The map must be locked. A read lock is sufficient.
4049 */
4050 boolean_t
4051 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
4052 vm_prot_t protection)
4053 {
4054 vm_map_entry_t entry;
4055 vm_map_entry_t tmp_entry;
4056
4057 if (!vm_map_lookup_entry(map, start, &tmp_entry))
4058 return (FALSE);
4059 entry = tmp_entry;
4060
4061 while (start < end) {
4062 /*
4063 * No holes allowed!
4064 */
4065 if (start < entry->start)
4066 return (FALSE);
4067 /*
4068 * Check protection associated with entry.
4069 */
4070 if ((entry->protection & protection) != protection)
4071 return (FALSE);
4072 /* go to next entry */
4073 start = entry->end;
4074 entry = vm_map_entry_succ(entry);
4075 }
4076 return (TRUE);
4077 }
4078
4079 /*
4080 *
4081 * vm_map_copy_swap_object:
4082 *
4083 * Copies a swap-backed object from an existing map entry to a
4084 * new one. Carries forward the swap charge. May change the
4085 * src object on return.
4086 */
4087 static void
4088 vm_map_copy_swap_object(vm_map_entry_t src_entry, vm_map_entry_t dst_entry,
4089 vm_offset_t size, vm_ooffset_t *fork_charge)
4090 {
4091 vm_object_t src_object;
4092 struct ucred *cred;
4093 int charged;
4094
4095 src_object = src_entry->object.vm_object;
4096 charged = ENTRY_CHARGED(src_entry);
4097 if ((src_object->flags & OBJ_ANON) != 0) {
4098 VM_OBJECT_WLOCK(src_object);
4099 vm_object_collapse(src_object);
4100 if ((src_object->flags & OBJ_ONEMAPPING) != 0) {
4101 vm_object_split(src_entry);
4102 src_object = src_entry->object.vm_object;
4103 }
4104 vm_object_reference_locked(src_object);
4105 vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
4106 VM_OBJECT_WUNLOCK(src_object);
4107 } else
4108 vm_object_reference(src_object);
4109 if (src_entry->cred != NULL &&
4110 !(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4111 KASSERT(src_object->cred == NULL,
4112 ("OVERCOMMIT: vm_map_copy_anon_entry: cred %p",
4113 src_object));
4114 src_object->cred = src_entry->cred;
4115 src_object->charge = size;
4116 }
4117 dst_entry->object.vm_object = src_object;
4118 if (charged) {
4119 cred = curthread->td_ucred;
4120 crhold(cred);
4121 dst_entry->cred = cred;
4122 *fork_charge += size;
4123 if (!(src_entry->eflags & MAP_ENTRY_NEEDS_COPY)) {
4124 crhold(cred);
4125 src_entry->cred = cred;
4126 *fork_charge += size;
4127 }
4128 }
4129 }
4130
4131 /*
4132 * vm_map_copy_entry:
4133 *
4134 * Copies the contents of the source entry to the destination
4135 * entry. The entries *must* be aligned properly.
4136 */
4137 static void
4138 vm_map_copy_entry(
4139 vm_map_t src_map,
4140 vm_map_t dst_map,
4141 vm_map_entry_t src_entry,
4142 vm_map_entry_t dst_entry,
4143 vm_ooffset_t *fork_charge)
4144 {
4145 vm_object_t src_object;
4146 vm_map_entry_t fake_entry;
4147 vm_offset_t size;
4148
4149 VM_MAP_ASSERT_LOCKED(dst_map);
4150
4151 if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
4152 return;
4153
4154 if (src_entry->wired_count == 0 ||
4155 (src_entry->protection & VM_PROT_WRITE) == 0) {
4156 /*
4157 * If the source entry is marked needs_copy, it is already
4158 * write-protected.
4159 */
4160 if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0 &&
4161 (src_entry->protection & VM_PROT_WRITE) != 0) {
4162 pmap_protect(src_map->pmap,
4163 src_entry->start,
4164 src_entry->end,
4165 src_entry->protection & ~VM_PROT_WRITE);
4166 }
4167
4168 /*
4169 * Make a copy of the object.
4170 */
4171 size = src_entry->end - src_entry->start;
4172 if ((src_object = src_entry->object.vm_object) != NULL) {
4173 if (src_object->type == OBJT_DEFAULT ||
4174 src_object->type == OBJT_SWAP) {
4175 vm_map_copy_swap_object(src_entry, dst_entry,
4176 size, fork_charge);
4177 /* May have split/collapsed, reload obj. */
4178 src_object = src_entry->object.vm_object;
4179 } else {
4180 vm_object_reference(src_object);
4181 dst_entry->object.vm_object = src_object;
4182 }
4183 src_entry->eflags |= MAP_ENTRY_COW |
4184 MAP_ENTRY_NEEDS_COPY;
4185 dst_entry->eflags |= MAP_ENTRY_COW |
4186 MAP_ENTRY_NEEDS_COPY;
4187 dst_entry->offset = src_entry->offset;
4188 if (src_entry->eflags & MAP_ENTRY_WRITECNT) {
4189 /*
4190 * MAP_ENTRY_WRITECNT cannot
4191 * indicate write reference from
4192 * src_entry, since the entry is
4193 * marked as needs copy. Allocate a
4194 * fake entry that is used to
4195 * decrement object->un_pager writecount
4196 * at the appropriate time. Attach
4197 * fake_entry to the deferred list.
4198 */
4199 fake_entry = vm_map_entry_create(dst_map);
4200 fake_entry->eflags = MAP_ENTRY_WRITECNT;
4201 src_entry->eflags &= ~MAP_ENTRY_WRITECNT;
4202 vm_object_reference(src_object);
4203 fake_entry->object.vm_object = src_object;
4204 fake_entry->start = src_entry->start;
4205 fake_entry->end = src_entry->end;
4206 fake_entry->defer_next =
4207 curthread->td_map_def_user;
4208 curthread->td_map_def_user = fake_entry;
4209 }
4210
4211 pmap_copy(dst_map->pmap, src_map->pmap,
4212 dst_entry->start, dst_entry->end - dst_entry->start,
4213 src_entry->start);
4214 } else {
4215 dst_entry->object.vm_object = NULL;
4216 dst_entry->offset = 0;
4217 if (src_entry->cred != NULL) {
4218 dst_entry->cred = curthread->td_ucred;
4219 crhold(dst_entry->cred);
4220 *fork_charge += size;
4221 }
4222 }
4223 } else {
4224 /*
4225 * We don't want to make writeable wired pages copy-on-write.
4226 * Immediately copy these pages into the new map by simulating
4227 * page faults. The new pages are pageable.
4228 */
4229 vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry,
4230 fork_charge);
4231 }
4232 }
4233
4234 /*
4235 * vmspace_map_entry_forked:
4236 * Update the newly-forked vmspace each time a map entry is inherited
4237 * or copied. The values for vm_dsize and vm_tsize are approximate
4238 * (and mostly-obsolete ideas in the face of mmap(2) et al.)
4239 */
4240 static void
4241 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
4242 vm_map_entry_t entry)
4243 {
4244 vm_size_t entrysize;
4245 vm_offset_t newend;
4246
4247 if ((entry->eflags & MAP_ENTRY_GUARD) != 0)
4248 return;
4249 entrysize = entry->end - entry->start;
4250 vm2->vm_map.size += entrysize;
4251 if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
4252 vm2->vm_ssize += btoc(entrysize);
4253 } else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
4254 entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
4255 newend = MIN(entry->end,
4256 (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
4257 vm2->vm_dsize += btoc(newend - entry->start);
4258 } else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
4259 entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
4260 newend = MIN(entry->end,
4261 (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
4262 vm2->vm_tsize += btoc(newend - entry->start);
4263 }
4264 }
4265
4266 /*
4267 * vmspace_fork:
4268 * Create a new process vmspace structure and vm_map
4269 * based on those of an existing process. The new map
4270 * is based on the old map, according to the inheritance
4271 * values on the regions in that map.
4272 *
4273 * XXX It might be worth coalescing the entries added to the new vmspace.
4274 *
4275 * The source map must not be locked.
4276 */
4277 struct vmspace *
4278 vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
4279 {
4280 struct vmspace *vm2;
4281 vm_map_t new_map, old_map;
4282 vm_map_entry_t new_entry, old_entry;
4283 vm_object_t object;
4284 int error, locked;
4285 vm_inherit_t inh;
4286
4287 old_map = &vm1->vm_map;
4288 /* Copy immutable fields of vm1 to vm2. */
4289 vm2 = vmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
4290 pmap_pinit);
4291 if (vm2 == NULL)
4292 return (NULL);
4293
4294 vm2->vm_taddr = vm1->vm_taddr;
4295 vm2->vm_daddr = vm1->vm_daddr;
4296 vm2->vm_maxsaddr = vm1->vm_maxsaddr;
4297 vm_map_lock(old_map);
4298 if (old_map->busy)
4299 vm_map_wait_busy(old_map);
4300 new_map = &vm2->vm_map;
4301 locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
4302 KASSERT(locked, ("vmspace_fork: lock failed"));
4303
4304 error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
4305 if (error != 0) {
4306 sx_xunlock(&old_map->lock);
4307 sx_xunlock(&new_map->lock);
4308 vm_map_process_deferred();
4309 vmspace_free(vm2);
4310 return (NULL);
4311 }
4312
4313 new_map->anon_loc = old_map->anon_loc;
4314 new_map->flags |= old_map->flags & (MAP_ASLR | MAP_ASLR_IGNSTART |
4315 MAP_WXORX);
4316
4317 VM_MAP_ENTRY_FOREACH(old_entry, old_map) {
4318 if ((old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
4319 panic("vm_map_fork: encountered a submap");
4320
4321 inh = old_entry->inheritance;
4322 if ((old_entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4323 inh != VM_INHERIT_NONE)
4324 inh = VM_INHERIT_COPY;
4325
4326 switch (inh) {
4327 case VM_INHERIT_NONE:
4328 break;
4329
4330 case VM_INHERIT_SHARE:
4331 /*
4332 * Clone the entry, creating the shared object if
4333 * necessary.
4334 */
4335 object = old_entry->object.vm_object;
4336 if (object == NULL) {
4337 vm_map_entry_back(old_entry);
4338 object = old_entry->object.vm_object;
4339 }
4340
4341 /*
4342 * Add the reference before calling vm_object_shadow
4343 * to insure that a shadow object is created.
4344 */
4345 vm_object_reference(object);
4346 if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
4347 vm_object_shadow(&old_entry->object.vm_object,
4348 &old_entry->offset,
4349 old_entry->end - old_entry->start,
4350 old_entry->cred,
4351 /* Transfer the second reference too. */
4352 true);
4353 old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
4354 old_entry->cred = NULL;
4355
4356 /*
4357 * As in vm_map_merged_neighbor_dispose(),
4358 * the vnode lock will not be acquired in
4359 * this call to vm_object_deallocate().
4360 */
4361 vm_object_deallocate(object);
4362 object = old_entry->object.vm_object;
4363 } else {
4364 VM_OBJECT_WLOCK(object);
4365 vm_object_clear_flag(object, OBJ_ONEMAPPING);
4366 if (old_entry->cred != NULL) {
4367 KASSERT(object->cred == NULL,
4368 ("vmspace_fork both cred"));
4369 object->cred = old_entry->cred;
4370 object->charge = old_entry->end -
4371 old_entry->start;
4372 old_entry->cred = NULL;
4373 }
4374
4375 /*
4376 * Assert the correct state of the vnode
4377 * v_writecount while the object is locked, to
4378 * not relock it later for the assertion
4379 * correctness.
4380 */
4381 if (old_entry->eflags & MAP_ENTRY_WRITECNT &&
4382 object->type == OBJT_VNODE) {
4383 KASSERT(((struct vnode *)object->
4384 handle)->v_writecount > 0,
4385 ("vmspace_fork: v_writecount %p",
4386 object));
4387 KASSERT(object->un_pager.vnp.
4388 writemappings > 0,
4389 ("vmspace_fork: vnp.writecount %p",
4390 object));
4391 }
4392 VM_OBJECT_WUNLOCK(object);
4393 }
4394
4395 /*
4396 * Clone the entry, referencing the shared object.
4397 */
4398 new_entry = vm_map_entry_create(new_map);
4399 *new_entry = *old_entry;
4400 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4401 MAP_ENTRY_IN_TRANSITION);
4402 new_entry->wiring_thread = NULL;
4403 new_entry->wired_count = 0;
4404 if (new_entry->eflags & MAP_ENTRY_WRITECNT) {
4405 vm_pager_update_writecount(object,
4406 new_entry->start, new_entry->end);
4407 }
4408 vm_map_entry_set_vnode_text(new_entry, true);
4409
4410 /*
4411 * Insert the entry into the new map -- we know we're
4412 * inserting at the end of the new map.
4413 */
4414 vm_map_entry_link(new_map, new_entry);
4415 vmspace_map_entry_forked(vm1, vm2, new_entry);
4416
4417 /*
4418 * Update the physical map
4419 */
4420 pmap_copy(new_map->pmap, old_map->pmap,
4421 new_entry->start,
4422 (old_entry->end - old_entry->start),
4423 old_entry->start);
4424 break;
4425
4426 case VM_INHERIT_COPY:
4427 /*
4428 * Clone the entry and link into the map.
4429 */
4430 new_entry = vm_map_entry_create(new_map);
4431 *new_entry = *old_entry;
4432 /*
4433 * Copied entry is COW over the old object.
4434 */
4435 new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
4436 MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_WRITECNT);
4437 new_entry->wiring_thread = NULL;
4438 new_entry->wired_count = 0;
4439 new_entry->object.vm_object = NULL;
4440 new_entry->cred = NULL;
4441 vm_map_entry_link(new_map, new_entry);
4442 vmspace_map_entry_forked(vm1, vm2, new_entry);
4443 vm_map_copy_entry(old_map, new_map, old_entry,
4444 new_entry, fork_charge);
4445 vm_map_entry_set_vnode_text(new_entry, true);
4446 break;
4447
4448 case VM_INHERIT_ZERO:
4449 /*
4450 * Create a new anonymous mapping entry modelled from
4451 * the old one.
4452 */
4453 new_entry = vm_map_entry_create(new_map);
4454 memset(new_entry, 0, sizeof(*new_entry));
4455
4456 new_entry->start = old_entry->start;
4457 new_entry->end = old_entry->end;
4458 new_entry->eflags = old_entry->eflags &
4459 ~(MAP_ENTRY_USER_WIRED | MAP_ENTRY_IN_TRANSITION |
4460 MAP_ENTRY_WRITECNT | MAP_ENTRY_VN_EXEC |
4461 MAP_ENTRY_SPLIT_BOUNDARY_MASK);
4462 new_entry->protection = old_entry->protection;
4463 new_entry->max_protection = old_entry->max_protection;
4464 new_entry->inheritance = VM_INHERIT_ZERO;
4465
4466 vm_map_entry_link(new_map, new_entry);
4467 vmspace_map_entry_forked(vm1, vm2, new_entry);
4468
4469 new_entry->cred = curthread->td_ucred;
4470 crhold(new_entry->cred);
4471 *fork_charge += (new_entry->end - new_entry->start);
4472
4473 break;
4474 }
4475 }
4476 /*
4477 * Use inlined vm_map_unlock() to postpone handling the deferred
4478 * map entries, which cannot be done until both old_map and
4479 * new_map locks are released.
4480 */
4481 sx_xunlock(&old_map->lock);
4482 sx_xunlock(&new_map->lock);
4483 vm_map_process_deferred();
4484
4485 return (vm2);
4486 }
4487
4488 /*
4489 * Create a process's stack for exec_new_vmspace(). This function is never
4490 * asked to wire the newly created stack.
4491 */
4492 int
4493 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4494 vm_prot_t prot, vm_prot_t max, int cow)
4495 {
4496 vm_size_t growsize, init_ssize;
4497 rlim_t vmemlim;
4498 int rv;
4499
4500 MPASS((map->flags & MAP_WIREFUTURE) == 0);
4501 growsize = sgrowsiz;
4502 init_ssize = (max_ssize < growsize) ? max_ssize : growsize;
4503 vm_map_lock(map);
4504 vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4505 /* If we would blow our VMEM resource limit, no go */
4506 if (map->size + init_ssize > vmemlim) {
4507 rv = KERN_NO_SPACE;
4508 goto out;
4509 }
4510 rv = vm_map_stack_locked(map, addrbos, max_ssize, growsize, prot,
4511 max, cow);
4512 out:
4513 vm_map_unlock(map);
4514 return (rv);
4515 }
4516
4517 static int stack_guard_page = 1;
4518 SYSCTL_INT(_security_bsd, OID_AUTO, stack_guard_page, CTLFLAG_RWTUN,
4519 &stack_guard_page, 0,
4520 "Specifies the number of guard pages for a stack that grows");
4521
4522 static int
4523 vm_map_stack_locked(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
4524 vm_size_t growsize, vm_prot_t prot, vm_prot_t max, int cow)
4525 {
4526 vm_map_entry_t new_entry, prev_entry;
4527 vm_offset_t bot, gap_bot, gap_top, top;
4528 vm_size_t init_ssize, sgp;
4529 int orient, rv;
4530
4531 /*
4532 * The stack orientation is piggybacked with the cow argument.
4533 * Extract it into orient and mask the cow argument so that we
4534 * don't pass it around further.
4535 */
4536 orient = cow & (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP);
4537 KASSERT(orient != 0, ("No stack grow direction"));
4538 KASSERT(orient != (MAP_STACK_GROWS_DOWN | MAP_STACK_GROWS_UP),
4539 ("bi-dir stack"));
4540
4541 if (max_ssize == 0 ||
4542 !vm_map_range_valid(map, addrbos, addrbos + max_ssize))
4543 return (KERN_INVALID_ADDRESS);
4544 sgp = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4545 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4546 (vm_size_t)stack_guard_page * PAGE_SIZE;
4547 if (sgp >= max_ssize)
4548 return (KERN_INVALID_ARGUMENT);
4549
4550 init_ssize = growsize;
4551 if (max_ssize < init_ssize + sgp)
4552 init_ssize = max_ssize - sgp;
4553
4554 /* If addr is already mapped, no go */
4555 if (vm_map_lookup_entry(map, addrbos, &prev_entry))
4556 return (KERN_NO_SPACE);
4557
4558 /*
4559 * If we can't accommodate max_ssize in the current mapping, no go.
4560 */
4561 if (vm_map_entry_succ(prev_entry)->start < addrbos + max_ssize)
4562 return (KERN_NO_SPACE);
4563
4564 /*
4565 * We initially map a stack of only init_ssize. We will grow as
4566 * needed later. Depending on the orientation of the stack (i.e.
4567 * the grow direction) we either map at the top of the range, the
4568 * bottom of the range or in the middle.
4569 *
4570 * Note: we would normally expect prot and max to be VM_PROT_ALL,
4571 * and cow to be 0. Possibly we should eliminate these as input
4572 * parameters, and just pass these values here in the insert call.
4573 */
4574 if (orient == MAP_STACK_GROWS_DOWN) {
4575 bot = addrbos + max_ssize - init_ssize;
4576 top = bot + init_ssize;
4577 gap_bot = addrbos;
4578 gap_top = bot;
4579 } else /* if (orient == MAP_STACK_GROWS_UP) */ {
4580 bot = addrbos;
4581 top = bot + init_ssize;
4582 gap_bot = top;
4583 gap_top = addrbos + max_ssize;
4584 }
4585 rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
4586 if (rv != KERN_SUCCESS)
4587 return (rv);
4588 new_entry = vm_map_entry_succ(prev_entry);
4589 KASSERT(new_entry->end == top || new_entry->start == bot,
4590 ("Bad entry start/end for new stack entry"));
4591 KASSERT((orient & MAP_STACK_GROWS_DOWN) == 0 ||
4592 (new_entry->eflags & MAP_ENTRY_GROWS_DOWN) != 0,
4593 ("new entry lacks MAP_ENTRY_GROWS_DOWN"));
4594 KASSERT((orient & MAP_STACK_GROWS_UP) == 0 ||
4595 (new_entry->eflags & MAP_ENTRY_GROWS_UP) != 0,
4596 ("new entry lacks MAP_ENTRY_GROWS_UP"));
4597 if (gap_bot == gap_top)
4598 return (KERN_SUCCESS);
4599 rv = vm_map_insert(map, NULL, 0, gap_bot, gap_top, VM_PROT_NONE,
4600 VM_PROT_NONE, MAP_CREATE_GUARD | (orient == MAP_STACK_GROWS_DOWN ?
4601 MAP_CREATE_STACK_GAP_DN : MAP_CREATE_STACK_GAP_UP));
4602 if (rv == KERN_SUCCESS) {
4603 /*
4604 * Gap can never successfully handle a fault, so
4605 * read-ahead logic is never used for it. Re-use
4606 * next_read of the gap entry to store
4607 * stack_guard_page for vm_map_growstack().
4608 */
4609 if (orient == MAP_STACK_GROWS_DOWN)
4610 vm_map_entry_pred(new_entry)->next_read = sgp;
4611 else
4612 vm_map_entry_succ(new_entry)->next_read = sgp;
4613 } else {
4614 (void)vm_map_delete(map, bot, top);
4615 }
4616 return (rv);
4617 }
4618
4619 /*
4620 * Attempts to grow a vm stack entry. Returns KERN_SUCCESS if we
4621 * successfully grow the stack.
4622 */
4623 static int
4624 vm_map_growstack(vm_map_t map, vm_offset_t addr, vm_map_entry_t gap_entry)
4625 {
4626 vm_map_entry_t stack_entry;
4627 struct proc *p;
4628 struct vmspace *vm;
4629 struct ucred *cred;
4630 vm_offset_t gap_end, gap_start, grow_start;
4631 vm_size_t grow_amount, guard, max_grow;
4632 rlim_t lmemlim, stacklim, vmemlim;
4633 int rv, rv1;
4634 bool gap_deleted, grow_down, is_procstack;
4635 #ifdef notyet
4636 uint64_t limit;
4637 #endif
4638 #ifdef RACCT
4639 int error;
4640 #endif
4641
4642 p = curproc;
4643 vm = p->p_vmspace;
4644
4645 /*
4646 * Disallow stack growth when the access is performed by a
4647 * debugger or AIO daemon. The reason is that the wrong
4648 * resource limits are applied.
4649 */
4650 if (p != initproc && (map != &p->p_vmspace->vm_map ||
4651 p->p_textvp == NULL))
4652 return (KERN_FAILURE);
4653
4654 MPASS(!map->system_map);
4655
4656 lmemlim = lim_cur(curthread, RLIMIT_MEMLOCK);
4657 stacklim = lim_cur(curthread, RLIMIT_STACK);
4658 vmemlim = lim_cur(curthread, RLIMIT_VMEM);
4659 retry:
4660 /* If addr is not in a hole for a stack grow area, no need to grow. */
4661 if (gap_entry == NULL && !vm_map_lookup_entry(map, addr, &gap_entry))
4662 return (KERN_FAILURE);
4663 if ((gap_entry->eflags & MAP_ENTRY_GUARD) == 0)
4664 return (KERN_SUCCESS);
4665 if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_DN) != 0) {
4666 stack_entry = vm_map_entry_succ(gap_entry);
4667 if ((stack_entry->eflags & MAP_ENTRY_GROWS_DOWN) == 0 ||
4668 stack_entry->start != gap_entry->end)
4669 return (KERN_FAILURE);
4670 grow_amount = round_page(stack_entry->start - addr);
4671 grow_down = true;
4672 } else if ((gap_entry->eflags & MAP_ENTRY_STACK_GAP_UP) != 0) {
4673 stack_entry = vm_map_entry_pred(gap_entry);
4674 if ((stack_entry->eflags & MAP_ENTRY_GROWS_UP) == 0 ||
4675 stack_entry->end != gap_entry->start)
4676 return (KERN_FAILURE);
4677 grow_amount = round_page(addr + 1 - stack_entry->end);
4678 grow_down = false;
4679 } else {
4680 return (KERN_FAILURE);
4681 }
4682 guard = ((curproc->p_flag2 & P2_STKGAP_DISABLE) != 0 ||
4683 (curproc->p_fctl0 & NT_FREEBSD_FCTL_STKGAP_DISABLE) != 0) ? 0 :
4684 gap_entry->next_read;
4685 max_grow = gap_entry->end - gap_entry->start;
4686 if (guard > max_grow)
4687 return (KERN_NO_SPACE);
4688 max_grow -= guard;
4689 if (grow_amount > max_grow)
4690 return (KERN_NO_SPACE);
4691
4692 /*
4693 * If this is the main process stack, see if we're over the stack
4694 * limit.
4695 */
4696 is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr &&
4697 addr < (vm_offset_t)p->p_sysent->sv_usrstack;
4698 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim))
4699 return (KERN_NO_SPACE);
4700
4701 #ifdef RACCT
4702 if (racct_enable) {
4703 PROC_LOCK(p);
4704 if (is_procstack && racct_set(p, RACCT_STACK,
4705 ctob(vm->vm_ssize) + grow_amount)) {
4706 PROC_UNLOCK(p);
4707 return (KERN_NO_SPACE);
4708 }
4709 PROC_UNLOCK(p);
4710 }
4711 #endif
4712
4713 grow_amount = roundup(grow_amount, sgrowsiz);
4714 if (grow_amount > max_grow)
4715 grow_amount = max_grow;
4716 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
4717 grow_amount = trunc_page((vm_size_t)stacklim) -
4718 ctob(vm->vm_ssize);
4719 }
4720
4721 #ifdef notyet
4722 PROC_LOCK(p);
4723 limit = racct_get_available(p, RACCT_STACK);
4724 PROC_UNLOCK(p);
4725 if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > limit))
4726 grow_amount = limit - ctob(vm->vm_ssize);
4727 #endif
4728
4729 if (!old_mlock && (map->flags & MAP_WIREFUTURE) != 0) {
4730 if (ptoa(pmap_wired_count(map->pmap)) + grow_amount > lmemlim) {
4731 rv = KERN_NO_SPACE;
4732 goto out;
4733 }
4734 #ifdef RACCT
4735 if (racct_enable) {
4736 PROC_LOCK(p);
4737 if (racct_set(p, RACCT_MEMLOCK,
4738 ptoa(pmap_wired_count(map->pmap)) + grow_amount)) {
4739 PROC_UNLOCK(p);
4740 rv = KERN_NO_SPACE;
4741 goto out;
4742 }
4743 PROC_UNLOCK(p);
4744 }
4745 #endif
4746 }
4747
4748 /* If we would blow our VMEM resource limit, no go */
4749 if (map->size + grow_amount > vmemlim) {
4750 rv = KERN_NO_SPACE;
4751 goto out;
4752 }
4753 #ifdef RACCT
4754 if (racct_enable) {
4755 PROC_LOCK(p);
4756 if (racct_set(p, RACCT_VMEM, map->size + grow_amount)) {
4757 PROC_UNLOCK(p);
4758 rv = KERN_NO_SPACE;
4759 goto out;
4760 }
4761 PROC_UNLOCK(p);
4762 }
4763 #endif
4764
4765 if (vm_map_lock_upgrade(map)) {
4766 gap_entry = NULL;
4767 vm_map_lock_read(map);
4768 goto retry;
4769 }
4770
4771 if (grow_down) {
4772 grow_start = gap_entry->end - grow_amount;
4773 if (gap_entry->start + grow_amount == gap_entry->end) {
4774 gap_start = gap_entry->start;
4775 gap_end = gap_entry->end;
4776 vm_map_entry_delete(map, gap_entry);
4777 gap_deleted = true;
4778 } else {
4779 MPASS(gap_entry->start < gap_entry->end - grow_amount);
4780 vm_map_entry_resize(map, gap_entry, -grow_amount);
4781 gap_deleted = false;
4782 }
4783 rv = vm_map_insert(map, NULL, 0, grow_start,
4784 grow_start + grow_amount,
4785 stack_entry->protection, stack_entry->max_protection,
4786 MAP_STACK_GROWS_DOWN);
4787 if (rv != KERN_SUCCESS) {
4788 if (gap_deleted) {
4789 rv1 = vm_map_insert(map, NULL, 0, gap_start,
4790 gap_end, VM_PROT_NONE, VM_PROT_NONE,
4791 MAP_CREATE_GUARD | MAP_CREATE_STACK_GAP_DN);
4792 MPASS(rv1 == KERN_SUCCESS);
4793 } else
4794 vm_map_entry_resize(map, gap_entry,
4795 grow_amount);
4796 }
4797 } else {
4798 grow_start = stack_entry->end;
4799 cred = stack_entry->cred;
4800 if (cred == NULL && stack_entry->object.vm_object != NULL)
4801 cred = stack_entry->object.vm_object->cred;
4802 if (cred != NULL && !swap_reserve_by_cred(grow_amount, cred))
4803 rv = KERN_NO_SPACE;
4804 /* Grow the underlying object if applicable. */
4805 else if (stack_entry->object.vm_object == NULL ||
4806 vm_object_coalesce(stack_entry->object.vm_object,
4807 stack_entry->offset,
4808 (vm_size_t)(stack_entry->end - stack_entry->start),
4809 grow_amount, cred != NULL)) {
4810 if (gap_entry->start + grow_amount == gap_entry->end) {
4811 vm_map_entry_delete(map, gap_entry);
4812 vm_map_entry_resize(map, stack_entry,
4813 grow_amount);
4814 } else {
4815 gap_entry->start += grow_amount;
4816 stack_entry->end += grow_amount;
4817 }
4818 map->size += grow_amount;
4819 rv = KERN_SUCCESS;
4820 } else
4821 rv = KERN_FAILURE;
4822 }
4823 if (rv == KERN_SUCCESS && is_procstack)
4824 vm->vm_ssize += btoc(grow_amount);
4825
4826 /*
4827 * Heed the MAP_WIREFUTURE flag if it was set for this process.
4828 */
4829 if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE) != 0) {
4830 rv = vm_map_wire_locked(map, grow_start,
4831 grow_start + grow_amount,
4832 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
4833 }
4834 vm_map_lock_downgrade(map);
4835
4836 out:
4837 #ifdef RACCT
4838 if (racct_enable && rv != KERN_SUCCESS) {
4839 PROC_LOCK(p);
4840 error = racct_set(p, RACCT_VMEM, map->size);
4841 KASSERT(error == 0, ("decreasing RACCT_VMEM failed"));
4842 if (!old_mlock) {
4843 error = racct_set(p, RACCT_MEMLOCK,
4844 ptoa(pmap_wired_count(map->pmap)));
4845 KASSERT(error == 0, ("decreasing RACCT_MEMLOCK failed"));
4846 }
4847 error = racct_set(p, RACCT_STACK, ctob(vm->vm_ssize));
4848 KASSERT(error == 0, ("decreasing RACCT_STACK failed"));
4849 PROC_UNLOCK(p);
4850 }
4851 #endif
4852
4853 return (rv);
4854 }
4855
4856 /*
4857 * Unshare the specified VM space for exec. If other processes are
4858 * mapped to it, then create a new one. The new vmspace is null.
4859 */
4860 int
4861 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
4862 {
4863 struct vmspace *oldvmspace = p->p_vmspace;
4864 struct vmspace *newvmspace;
4865
4866 KASSERT((curthread->td_pflags & TDP_EXECVMSPC) == 0,
4867 ("vmspace_exec recursed"));
4868 newvmspace = vmspace_alloc(minuser, maxuser, pmap_pinit);
4869 if (newvmspace == NULL)
4870 return (ENOMEM);
4871 newvmspace->vm_swrss = oldvmspace->vm_swrss;
4872 /*
4873 * This code is written like this for prototype purposes. The
4874 * goal is to avoid running down the vmspace here, but let the
4875 * other process's that are still using the vmspace to finally
4876 * run it down. Even though there is little or no chance of blocking
4877 * here, it is a good idea to keep this form for future mods.
4878 */
4879 PROC_VMSPACE_LOCK(p);
4880 p->p_vmspace = newvmspace;
4881 PROC_VMSPACE_UNLOCK(p);
4882 if (p == curthread->td_proc)
4883 pmap_activate(curthread);
4884 curthread->td_pflags |= TDP_EXECVMSPC;
4885 return (0);
4886 }
4887
4888 /*
4889 * Unshare the specified VM space for forcing COW. This
4890 * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
4891 */
4892 int
4893 vmspace_unshare(struct proc *p)
4894 {
4895 struct vmspace *oldvmspace = p->p_vmspace;
4896 struct vmspace *newvmspace;
4897 vm_ooffset_t fork_charge;
4898
4899 if (refcount_load(&oldvmspace->vm_refcnt) == 1)
4900 return (0);
4901 fork_charge = 0;
4902 newvmspace = vmspace_fork(oldvmspace, &fork_charge);
4903 if (newvmspace == NULL)
4904 return (ENOMEM);
4905 if (!swap_reserve_by_cred(fork_charge, p->p_ucred)) {
4906 vmspace_free(newvmspace);
4907 return (ENOMEM);
4908 }
4909 PROC_VMSPACE_LOCK(p);
4910 p->p_vmspace = newvmspace;
4911 PROC_VMSPACE_UNLOCK(p);
4912 if (p == curthread->td_proc)
4913 pmap_activate(curthread);
4914 vmspace_free(oldvmspace);
4915 return (0);
4916 }
4917
4918 /*
4919 * vm_map_lookup:
4920 *
4921 * Finds the VM object, offset, and
4922 * protection for a given virtual address in the
4923 * specified map, assuming a page fault of the
4924 * type specified.
4925 *
4926 * Leaves the map in question locked for read; return
4927 * values are guaranteed until a vm_map_lookup_done
4928 * call is performed. Note that the map argument
4929 * is in/out; the returned map must be used in
4930 * the call to vm_map_lookup_done.
4931 *
4932 * A handle (out_entry) is returned for use in
4933 * vm_map_lookup_done, to make that fast.
4934 *
4935 * If a lookup is requested with "write protection"
4936 * specified, the map may be changed to perform virtual
4937 * copying operations, although the data referenced will
4938 * remain the same.
4939 */
4940 int
4941 vm_map_lookup(vm_map_t *var_map, /* IN/OUT */
4942 vm_offset_t vaddr,
4943 vm_prot_t fault_typea,
4944 vm_map_entry_t *out_entry, /* OUT */
4945 vm_object_t *object, /* OUT */
4946 vm_pindex_t *pindex, /* OUT */
4947 vm_prot_t *out_prot, /* OUT */
4948 boolean_t *wired) /* OUT */
4949 {
4950 vm_map_entry_t entry;
4951 vm_map_t map = *var_map;
4952 vm_prot_t prot;
4953 vm_prot_t fault_type;
4954 vm_object_t eobject;
4955 vm_size_t size;
4956 struct ucred *cred;
4957
4958 RetryLookup:
4959
4960 vm_map_lock_read(map);
4961
4962 RetryLookupLocked:
4963 /*
4964 * Lookup the faulting address.
4965 */
4966 if (!vm_map_lookup_entry(map, vaddr, out_entry)) {
4967 vm_map_unlock_read(map);
4968 return (KERN_INVALID_ADDRESS);
4969 }
4970
4971 entry = *out_entry;
4972
4973 /*
4974 * Handle submaps.
4975 */
4976 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
4977 vm_map_t old_map = map;
4978
4979 *var_map = map = entry->object.sub_map;
4980 vm_map_unlock_read(old_map);
4981 goto RetryLookup;
4982 }
4983
4984 /*
4985 * Check whether this task is allowed to have this page.
4986 */
4987 prot = entry->protection;
4988 if ((fault_typea & VM_PROT_FAULT_LOOKUP) != 0) {
4989 fault_typea &= ~VM_PROT_FAULT_LOOKUP;
4990 if (prot == VM_PROT_NONE && map != kernel_map &&
4991 (entry->eflags & MAP_ENTRY_GUARD) != 0 &&
4992 (entry->eflags & (MAP_ENTRY_STACK_GAP_DN |
4993 MAP_ENTRY_STACK_GAP_UP)) != 0 &&
4994 vm_map_growstack(map, vaddr, entry) == KERN_SUCCESS)
4995 goto RetryLookupLocked;
4996 }
4997 fault_type = fault_typea & VM_PROT_ALL;
4998 if ((fault_type & prot) != fault_type || prot == VM_PROT_NONE) {
4999 vm_map_unlock_read(map);
5000 return (KERN_PROTECTION_FAILURE);
5001 }
5002 KASSERT((prot & VM_PROT_WRITE) == 0 || (entry->eflags &
5003 (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY)) !=
5004 (MAP_ENTRY_USER_WIRED | MAP_ENTRY_NEEDS_COPY),
5005 ("entry %p flags %x", entry, entry->eflags));
5006 if ((fault_typea & VM_PROT_COPY) != 0 &&
5007 (entry->max_protection & VM_PROT_WRITE) == 0 &&
5008 (entry->eflags & MAP_ENTRY_COW) == 0) {
5009 vm_map_unlock_read(map);
5010 return (KERN_PROTECTION_FAILURE);
5011 }
5012
5013 /*
5014 * If this page is not pageable, we have to get it for all possible
5015 * accesses.
5016 */
5017 *wired = (entry->wired_count != 0);
5018 if (*wired)
5019 fault_type = entry->protection;
5020 size = entry->end - entry->start;
5021
5022 /*
5023 * If the entry was copy-on-write, we either ...
5024 */
5025 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5026 /*
5027 * If we want to write the page, we may as well handle that
5028 * now since we've got the map locked.
5029 *
5030 * If we don't need to write the page, we just demote the
5031 * permissions allowed.
5032 */
5033 if ((fault_type & VM_PROT_WRITE) != 0 ||
5034 (fault_typea & VM_PROT_COPY) != 0) {
5035 /*
5036 * Make a new object, and place it in the object
5037 * chain. Note that no new references have appeared
5038 * -- one just moved from the map to the new
5039 * object.
5040 */
5041 if (vm_map_lock_upgrade(map))
5042 goto RetryLookup;
5043
5044 if (entry->cred == NULL) {
5045 /*
5046 * The debugger owner is charged for
5047 * the memory.
5048 */
5049 cred = curthread->td_ucred;
5050 crhold(cred);
5051 if (!swap_reserve_by_cred(size, cred)) {
5052 crfree(cred);
5053 vm_map_unlock(map);
5054 return (KERN_RESOURCE_SHORTAGE);
5055 }
5056 entry->cred = cred;
5057 }
5058 eobject = entry->object.vm_object;
5059 vm_object_shadow(&entry->object.vm_object,
5060 &entry->offset, size, entry->cred, false);
5061 if (eobject == entry->object.vm_object) {
5062 /*
5063 * The object was not shadowed.
5064 */
5065 swap_release_by_cred(size, entry->cred);
5066 crfree(entry->cred);
5067 }
5068 entry->cred = NULL;
5069 entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
5070
5071 vm_map_lock_downgrade(map);
5072 } else {
5073 /*
5074 * We're attempting to read a copy-on-write page --
5075 * don't allow writes.
5076 */
5077 prot &= ~VM_PROT_WRITE;
5078 }
5079 }
5080
5081 /*
5082 * Create an object if necessary.
5083 */
5084 if (entry->object.vm_object == NULL && !map->system_map) {
5085 if (vm_map_lock_upgrade(map))
5086 goto RetryLookup;
5087 entry->object.vm_object = vm_object_allocate_anon(atop(size),
5088 NULL, entry->cred, entry->cred != NULL ? size : 0);
5089 entry->offset = 0;
5090 entry->cred = NULL;
5091 vm_map_lock_downgrade(map);
5092 }
5093
5094 /*
5095 * Return the object/offset from this entry. If the entry was
5096 * copy-on-write or empty, it has been fixed up.
5097 */
5098 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5099 *object = entry->object.vm_object;
5100
5101 *out_prot = prot;
5102 return (KERN_SUCCESS);
5103 }
5104
5105 /*
5106 * vm_map_lookup_locked:
5107 *
5108 * Lookup the faulting address. A version of vm_map_lookup that returns
5109 * KERN_FAILURE instead of blocking on map lock or memory allocation.
5110 */
5111 int
5112 vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */
5113 vm_offset_t vaddr,
5114 vm_prot_t fault_typea,
5115 vm_map_entry_t *out_entry, /* OUT */
5116 vm_object_t *object, /* OUT */
5117 vm_pindex_t *pindex, /* OUT */
5118 vm_prot_t *out_prot, /* OUT */
5119 boolean_t *wired) /* OUT */
5120 {
5121 vm_map_entry_t entry;
5122 vm_map_t map = *var_map;
5123 vm_prot_t prot;
5124 vm_prot_t fault_type = fault_typea;
5125
5126 /*
5127 * Lookup the faulting address.
5128 */
5129 if (!vm_map_lookup_entry(map, vaddr, out_entry))
5130 return (KERN_INVALID_ADDRESS);
5131
5132 entry = *out_entry;
5133
5134 /*
5135 * Fail if the entry refers to a submap.
5136 */
5137 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
5138 return (KERN_FAILURE);
5139
5140 /*
5141 * Check whether this task is allowed to have this page.
5142 */
5143 prot = entry->protection;
5144 fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
5145 if ((fault_type & prot) != fault_type)
5146 return (KERN_PROTECTION_FAILURE);
5147
5148 /*
5149 * If this page is not pageable, we have to get it for all possible
5150 * accesses.
5151 */
5152 *wired = (entry->wired_count != 0);
5153 if (*wired)
5154 fault_type = entry->protection;
5155
5156 if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
5157 /*
5158 * Fail if the entry was copy-on-write for a write fault.
5159 */
5160 if (fault_type & VM_PROT_WRITE)
5161 return (KERN_FAILURE);
5162 /*
5163 * We're attempting to read a copy-on-write page --
5164 * don't allow writes.
5165 */
5166 prot &= ~VM_PROT_WRITE;
5167 }
5168
5169 /*
5170 * Fail if an object should be created.
5171 */
5172 if (entry->object.vm_object == NULL && !map->system_map)
5173 return (KERN_FAILURE);
5174
5175 /*
5176 * Return the object/offset from this entry. If the entry was
5177 * copy-on-write or empty, it has been fixed up.
5178 */
5179 *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
5180 *object = entry->object.vm_object;
5181
5182 *out_prot = prot;
5183 return (KERN_SUCCESS);
5184 }
5185
5186 /*
5187 * vm_map_lookup_done:
5188 *
5189 * Releases locks acquired by a vm_map_lookup
5190 * (according to the handle returned by that lookup).
5191 */
5192 void
5193 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
5194 {
5195 /*
5196 * Unlock the main-level map
5197 */
5198 vm_map_unlock_read(map);
5199 }
5200
5201 vm_offset_t
5202 vm_map_max_KBI(const struct vm_map *map)
5203 {
5204
5205 return (vm_map_max(map));
5206 }
5207
5208 vm_offset_t
5209 vm_map_min_KBI(const struct vm_map *map)
5210 {
5211
5212 return (vm_map_min(map));
5213 }
5214
5215 pmap_t
5216 vm_map_pmap_KBI(vm_map_t map)
5217 {
5218
5219 return (map->pmap);
5220 }
5221
5222 bool
5223 vm_map_range_valid_KBI(vm_map_t map, vm_offset_t start, vm_offset_t end)
5224 {
5225
5226 return (vm_map_range_valid(map, start, end));
5227 }
5228
5229 #ifdef INVARIANTS
5230 static void
5231 _vm_map_assert_consistent(vm_map_t map, int check)
5232 {
5233 vm_map_entry_t entry, prev;
5234 vm_map_entry_t cur, header, lbound, ubound;
5235 vm_size_t max_left, max_right;
5236
5237 #ifdef DIAGNOSTIC
5238 ++map->nupdates;
5239 #endif
5240 if (enable_vmmap_check != check)
5241 return;
5242
5243 header = prev = &map->header;
5244 VM_MAP_ENTRY_FOREACH(entry, map) {
5245 KASSERT(prev->end <= entry->start,
5246 ("map %p prev->end = %jx, start = %jx", map,
5247 (uintmax_t)prev->end, (uintmax_t)entry->start));
5248 KASSERT(entry->start < entry->end,
5249 ("map %p start = %jx, end = %jx", map,
5250 (uintmax_t)entry->start, (uintmax_t)entry->end));
5251 KASSERT(entry->left == header ||
5252 entry->left->start < entry->start,
5253 ("map %p left->start = %jx, start = %jx", map,
5254 (uintmax_t)entry->left->start, (uintmax_t)entry->start));
5255 KASSERT(entry->right == header ||
5256 entry->start < entry->right->start,
5257 ("map %p start = %jx, right->start = %jx", map,
5258 (uintmax_t)entry->start, (uintmax_t)entry->right->start));
5259 cur = map->root;
5260 lbound = ubound = header;
5261 for (;;) {
5262 if (entry->start < cur->start) {
5263 ubound = cur;
5264 cur = cur->left;
5265 KASSERT(cur != lbound,
5266 ("map %p cannot find %jx",
5267 map, (uintmax_t)entry->start));
5268 } else if (cur->end <= entry->start) {
5269 lbound = cur;
5270 cur = cur->right;
5271 KASSERT(cur != ubound,
5272 ("map %p cannot find %jx",
5273 map, (uintmax_t)entry->start));
5274 } else {
5275 KASSERT(cur == entry,
5276 ("map %p cannot find %jx",
5277 map, (uintmax_t)entry->start));
5278 break;
5279 }
5280 }
5281 max_left = vm_map_entry_max_free_left(entry, lbound);
5282 max_right = vm_map_entry_max_free_right(entry, ubound);
5283 KASSERT(entry->max_free == vm_size_max(max_left, max_right),
5284 ("map %p max = %jx, max_left = %jx, max_right = %jx", map,
5285 (uintmax_t)entry->max_free,
5286 (uintmax_t)max_left, (uintmax_t)max_right));
5287 prev = entry;
5288 }
5289 KASSERT(prev->end <= entry->start,
5290 ("map %p prev->end = %jx, start = %jx", map,
5291 (uintmax_t)prev->end, (uintmax_t)entry->start));
5292 }
5293 #endif
5294
5295 #include "opt_ddb.h"
5296 #ifdef DDB
5297 #include <sys/kernel.h>
5298
5299 #include <ddb/ddb.h>
5300
5301 static void
5302 vm_map_print(vm_map_t map)
5303 {
5304 vm_map_entry_t entry, prev;
5305
5306 db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
5307 (void *)map,
5308 (void *)map->pmap, map->nentries, map->timestamp);
5309
5310 db_indent += 2;
5311 prev = &map->header;
5312 VM_MAP_ENTRY_FOREACH(entry, map) {
5313 db_iprintf("map entry %p: start=%p, end=%p, eflags=%#x, \n",
5314 (void *)entry, (void *)entry->start, (void *)entry->end,
5315 entry->eflags);
5316 {
5317 static const char * const inheritance_name[4] =
5318 {"share", "copy", "none", "donate_copy"};
5319
5320 db_iprintf(" prot=%x/%x/%s",
5321 entry->protection,
5322 entry->max_protection,
5323 inheritance_name[(int)(unsigned char)
5324 entry->inheritance]);
5325 if (entry->wired_count != 0)
5326 db_printf(", wired");
5327 }
5328 if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
5329 db_printf(", share=%p, offset=0x%jx\n",
5330 (void *)entry->object.sub_map,
5331 (uintmax_t)entry->offset);
5332 if (prev == &map->header ||
5333 prev->object.sub_map !=
5334 entry->object.sub_map) {
5335 db_indent += 2;
5336 vm_map_print((vm_map_t)entry->object.sub_map);
5337 db_indent -= 2;
5338 }
5339 } else {
5340 if (entry->cred != NULL)
5341 db_printf(", ruid %d", entry->cred->cr_ruid);
5342 db_printf(", object=%p, offset=0x%jx",
5343 (void *)entry->object.vm_object,
5344 (uintmax_t)entry->offset);
5345 if (entry->object.vm_object && entry->object.vm_object->cred)
5346 db_printf(", obj ruid %d charge %jx",
5347 entry->object.vm_object->cred->cr_ruid,
5348 (uintmax_t)entry->object.vm_object->charge);
5349 if (entry->eflags & MAP_ENTRY_COW)
5350 db_printf(", copy (%s)",
5351 (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
5352 db_printf("\n");
5353
5354 if (prev == &map->header ||
5355 prev->object.vm_object !=
5356 entry->object.vm_object) {
5357 db_indent += 2;
5358 vm_object_print((db_expr_t)(intptr_t)
5359 entry->object.vm_object,
5360 0, 0, (char *)0);
5361 db_indent -= 2;
5362 }
5363 }
5364 prev = entry;
5365 }
5366 db_indent -= 2;
5367 }
5368
5369 DB_SHOW_COMMAND(map, map)
5370 {
5371
5372 if (!have_addr) {
5373 db_printf("usage: show map <addr>\n");
5374 return;
5375 }
5376 vm_map_print((vm_map_t)addr);
5377 }
5378
5379 DB_SHOW_COMMAND(procvm, procvm)
5380 {
5381 struct proc *p;
5382
5383 if (have_addr) {
5384 p = db_lookup_proc(addr);
5385 } else {
5386 p = curproc;
5387 }
5388
5389 db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
5390 (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
5391 (void *)vmspace_pmap(p->p_vmspace));
5392
5393 vm_map_print((vm_map_t)&p->p_vmspace->vm_map);
5394 }
5395
5396 #endif /* DDB */
Cache object: 4ae8fe3b700e846ec9def0291404fb37
|