FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c
1 /*-
2 * Copyright (c) 1988 University of Utah.
3 * Copyright (c) 1991, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the Systems Programming Group of the University of Utah Computer
8 * Science Department.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
35 *
36 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
37 */
38
39 /*
40 * Mapped file (mmap) interface to VM
41 */
42
43 #include <sys/cdefs.h>
44 __FBSDID("$FreeBSD: releng/7.4/sys/vm/vm_mmap.c 206603 2010-04-14 15:23:16Z jhb $");
45
46 #include "opt_compat.h"
47 #include "opt_hwpmc_hooks.h"
48 #include "opt_mac.h"
49
50 #include <sys/param.h>
51 #include <sys/systm.h>
52 #include <sys/kernel.h>
53 #include <sys/lock.h>
54 #include <sys/mutex.h>
55 #include <sys/sysproto.h>
56 #include <sys/filedesc.h>
57 #include <sys/priv.h>
58 #include <sys/proc.h>
59 #include <sys/resource.h>
60 #include <sys/resourcevar.h>
61 #include <sys/vnode.h>
62 #include <sys/fcntl.h>
63 #include <sys/file.h>
64 #include <sys/mman.h>
65 #include <sys/mount.h>
66 #include <sys/conf.h>
67 #include <sys/stat.h>
68 #include <sys/vmmeter.h>
69 #include <sys/sysctl.h>
70
71 #include <security/mac/mac_framework.h>
72
73 #include <vm/vm.h>
74 #include <vm/vm_param.h>
75 #include <vm/pmap.h>
76 #include <vm/vm_map.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_page.h>
79 #include <vm/vm_pager.h>
80 #include <vm/vm_pageout.h>
81 #include <vm/vm_extern.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_kern.h>
84
85 #ifdef HWPMC_HOOKS
86 #include <sys/pmckern.h>
87 #endif
88
89 #ifndef _SYS_SYSPROTO_H_
90 struct sbrk_args {
91 int incr;
92 };
93 #endif
94
95 static int max_proc_mmap;
96 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
97
98 /*
99 * Set the maximum number of vm_map_entry structures per process. Roughly
100 * speaking, vm_map_entry structures are tiny, so allowing them to eat 1/100
101 * of our KVM malloc space still results in generous limits. We want a
102 * default that is good enough to prevent the kernel from running out of
103 * resources if attacked from a compromised user account, yet generous enough that
104 * multi-threaded processes are not unduly inconvenienced.
105 */
106 static void vmmapentry_rsrc_init(void *);
107 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init,
108 NULL);
109
110 static void
111 vmmapentry_rsrc_init(dummy)
112 void *dummy;
113 {
114 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
115 max_proc_mmap /= 100;
116 }
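
/*
 * For a rough sense of scale (illustrative figures, not values taken from
 * this file): with vm_kmem_size around 320 MB and a vm_map_entry of roughly
 * 100 bytes, the computation above yields a max_proc_mmap on the order of
 * 33,000 map entries per process; the mmap() syscall below starts returning
 * ENOMEM once vm_map.nentries reaches max_proc_mmap * vm_refcnt.
 */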
117
118 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
119 int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
120 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
121 int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
122
123 /*
124 * MPSAFE
125 */
126 /* ARGSUSED */
127 int
128 sbrk(td, uap)
129 struct thread *td;
130 struct sbrk_args *uap;
131 {
132 /* Not yet implemented */
133 return (EOPNOTSUPP);
134 }
135
136 #ifndef _SYS_SYSPROTO_H_
137 struct sstk_args {
138 int incr;
139 };
140 #endif
141
142 /*
143 * MPSAFE
144 */
145 /* ARGSUSED */
146 int
147 sstk(td, uap)
148 struct thread *td;
149 struct sstk_args *uap;
150 {
151 /* Not yet implemented */
152 return (EOPNOTSUPP);
153 }
154
155 #if defined(COMPAT_43)
156 #ifndef _SYS_SYSPROTO_H_
157 struct getpagesize_args {
158 int dummy;
159 };
160 #endif
161
162 /* ARGSUSED */
163 int
164 ogetpagesize(td, uap)
165 struct thread *td;
166 struct getpagesize_args *uap;
167 {
168 /* MP SAFE */
169 td->td_retval[0] = PAGE_SIZE;
170 return (0);
171 }
172 #endif /* COMPAT_43 */
173
174
175 /*
176 * Memory Map (mmap) system call. Note that the file offset
177 * and address are allowed to be NOT page aligned, though if
178 * the MAP_FIXED flag is set, both must have the same remainder
179 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
180 * page-aligned, the actual mapping starts at trunc_page(addr)
181 * and the return value is adjusted up by the page offset.
182 *
183 * Generally speaking, only character devices which are themselves
184 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise
185 * there would be no cache coherency between a descriptor and a VM mapping
186 * both to the same character device.
187 *
188 * Block devices can be mmap'd no matter what they represent. Cache coherency
189 * is maintained as long as you do not write directly to the underlying
190 * character device.
191 */
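
A minimal userland sketch of the offset handling described above (the file
name is hypothetical and the file is assumed to be several pages long; this
reflects the behavior documented in the comment, not a portable guarantee):

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        char *p;
        int fd;

        fd = open("/tmp/example", O_RDONLY);    /* hypothetical file */
        if (fd == -1)
                return (1);
        /*
         * 0x1234 is not page aligned: the kernel maps from the truncated
         * offset 0x1000 and returns a pointer advanced by the page offset
         * 0x234, so p[0] corresponds to byte 0x1234 of the file.
         */
        p = mmap(NULL, 100, PROT_READ, MAP_SHARED, fd, 0x1234);
        if (p == MAP_FAILED) {
                close(fd);
                return (1);
        }
        printf("%c\n", p[0]);
        munmap(p, 100);
        close(fd);
        return (0);
}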
192 #ifndef _SYS_SYSPROTO_H_
193 struct mmap_args {
194 void *addr;
195 size_t len;
196 int prot;
197 int flags;
198 int fd;
199 long pad;
200 off_t pos;
201 };
202 #endif
203
204 /*
205 * MPSAFE
206 */
207 int
208 mmap(td, uap)
209 struct thread *td;
210 struct mmap_args *uap;
211 {
212 #ifdef HWPMC_HOOKS
213 struct pmckern_map_in pkm;
214 #endif
215 struct file *fp;
216 struct vnode *vp;
217 vm_offset_t addr;
218 vm_size_t size, pageoff;
219 vm_prot_t prot, maxprot;
220 void *handle;
221 objtype_t handle_type;
222 int flags, error;
223 off_t pos;
224 struct vmspace *vms = td->td_proc->p_vmspace;
225
226 addr = (vm_offset_t) uap->addr;
227 size = uap->len;
228 prot = uap->prot & VM_PROT_ALL;
229 flags = uap->flags;
230 pos = uap->pos;
231
232 fp = NULL;
233 /* make sure mapping fits into numeric range etc */
234 if ((ssize_t) uap->len < 0 ||
235 ((flags & MAP_ANON) && (uap->fd != -1 || pos != 0)))
236 return (EINVAL);
237
238 if (flags & MAP_STACK) {
239 if ((uap->fd != -1) ||
240 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
241 return (EINVAL);
242 flags |= MAP_ANON;
243 pos = 0;
244 }
245
246 /*
247 * Align the file position to a page boundary,
248 * and save its page offset component.
249 */
250 pageoff = (pos & PAGE_MASK);
251 pos -= pageoff;
252
253 /* Adjust size for rounding (on both ends). */
254 size += pageoff; /* low end... */
255 size = (vm_size_t) round_page(size); /* hi end */
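
/*
 * Worked example (illustrative, assuming 4 KB pages): for uap->pos == 0x1234
 * and uap->len == 100, pageoff is 0x234, pos becomes 0x1000, and size becomes
 * round_page(100 + 0x234) == 0x1000, one full page covering the requested
 * bytes; the value returned to the caller below is addr + pageoff.
 */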
256
257 /*
258 * Check for illegal addresses. Watch out for address wrap... Note
259 * that VM_*_ADDRESS are not constants due to casts (argh).
260 */
261 if (flags & MAP_FIXED) {
262 /*
263 * The specified address must have the same remainder
264 * as the file offset taken modulo PAGE_SIZE, so it
265 * should be aligned after adjustment by pageoff.
266 */
267 addr -= pageoff;
268 if (addr & PAGE_MASK)
269 return (EINVAL);
270 /* Address range must be all in user VM space. */
271 if (addr < vm_map_min(&vms->vm_map) ||
272 addr + size > vm_map_max(&vms->vm_map))
273 return (EINVAL);
274 if (addr + size < addr)
275 return (EINVAL);
276 } else {
277 /*
278 * XXX for non-fixed mappings where no hint is provided or
279 * the hint would fall in the potential heap space,
280 * place it after the end of the largest possible heap.
281 *
282 * There should really be a pmap call to determine a reasonable
283 * location.
284 */
285 PROC_LOCK(td->td_proc);
286 if (addr == 0 ||
287 (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
288 addr < round_page((vm_offset_t)vms->vm_daddr +
289 lim_max(td->td_proc, RLIMIT_DATA))))
290 addr = round_page((vm_offset_t)vms->vm_daddr +
291 lim_max(td->td_proc, RLIMIT_DATA));
292 PROC_UNLOCK(td->td_proc);
293 }
294 if (flags & MAP_ANON) {
295 /*
296 * Mapping blank space is trivial.
297 */
298 handle = NULL;
299 handle_type = OBJT_DEFAULT;
300 maxprot = VM_PROT_ALL;
301 } else {
302 /*
303 * Mapping a file: get fp for validation and obtain the vnode, making
304 * sure it is of the appropriate type.
305 * Don't let the descriptor disappear on us if we block.
306 */
307 if ((error = fget(td, uap->fd, &fp)) != 0)
308 goto done;
309 if (fp->f_type != DTYPE_VNODE) {
310 error = ENODEV;
311 goto done;
312 }
313 /*
314 * POSIX shared-memory objects are defined to have
315 * kernel persistence, and are not defined to support
316 * read(2)/write(2) -- or even open(2). Thus, we can
317 * use MAP_ASYNC to trade on-disk coherence for speed.
318 * The shm_open(3) library routine turns on the FPOSIXSHM
319 * flag to request this behavior.
320 */
321 if (fp->f_flag & FPOSIXSHM)
322 flags |= MAP_NOSYNC;
323 vp = fp->f_vnode;
324 /*
325 * Ensure that file and memory protections are
326 * compatible. Note that we only worry about
327 * writability if mapping is shared; in this case,
328 * current and max prot are dictated by the open file.
329 * XXX use the vnode instead? Problem is: what
330 * credentials do we use for determination? What if
331 * proc does a setuid?
332 */
333 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
334 maxprot = VM_PROT_NONE;
335 else
336 maxprot = VM_PROT_EXECUTE;
337 if (fp->f_flag & FREAD) {
338 maxprot |= VM_PROT_READ;
339 } else if (prot & PROT_READ) {
340 error = EACCES;
341 goto done;
342 }
343 /*
344 * If we are sharing potential changes (either via
345 * MAP_SHARED or via the implicit sharing of character
346 * device mappings), and we are trying to get write
347 * permission although we opened it without asking
348 * for it, bail out.
349 */
350 if ((flags & MAP_SHARED) != 0) {
351 if ((fp->f_flag & FWRITE) != 0) {
352 maxprot |= VM_PROT_WRITE;
353 } else if ((prot & PROT_WRITE) != 0) {
354 error = EACCES;
355 goto done;
356 }
357 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
358 maxprot |= VM_PROT_WRITE;
359 }
360 handle = (void *)vp;
361 handle_type = OBJT_VNODE;
362 }
363
364 /*
365 * Do not allow more than a certain number of vm_map_entry structures
366 * per process. Scale with the number of rforks sharing the map
367 * to make the limit reasonable for threads.
368 */
369 if (max_proc_mmap &&
370 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
371 error = ENOMEM;
372 goto done;
373 }
374
375 td->td_fpop = fp;
376 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
377 flags, handle_type, handle, pos);
378 td->td_fpop = NULL;
379 #ifdef HWPMC_HOOKS
380 /* inform hwpmc(4) if an executable is being mapped */
381 if (error == 0 && handle_type == OBJT_VNODE &&
382 (prot & PROT_EXEC)) {
383 pkm.pm_file = handle;
384 pkm.pm_address = (uintptr_t) addr;
385 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
386 }
387 #endif
388 if (error == 0)
389 td->td_retval[0] = (register_t) (addr + pageoff);
390 done:
391 if (fp)
392 fdrop(fp, td);
393
394 return (error);
395 }
396
397 int
398 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
399 {
400 struct mmap_args oargs;
401
402 oargs.addr = uap->addr;
403 oargs.len = uap->len;
404 oargs.prot = uap->prot;
405 oargs.flags = uap->flags;
406 oargs.fd = uap->fd;
407 oargs.pos = uap->pos;
408 return (mmap(td, &oargs));
409 }
410
411 #ifdef COMPAT_43
412 #ifndef _SYS_SYSPROTO_H_
413 struct ommap_args {
414 caddr_t addr;
415 int len;
416 int prot;
417 int flags;
418 int fd;
419 long pos;
420 };
421 #endif
422 int
423 ommap(td, uap)
424 struct thread *td;
425 struct ommap_args *uap;
426 {
427 struct mmap_args nargs;
428 static const char cvtbsdprot[8] = {
429 0,
430 PROT_EXEC,
431 PROT_WRITE,
432 PROT_EXEC | PROT_WRITE,
433 PROT_READ,
434 PROT_EXEC | PROT_READ,
435 PROT_WRITE | PROT_READ,
436 PROT_EXEC | PROT_WRITE | PROT_READ,
437 };
438
439 #define OMAP_ANON 0x0002
440 #define OMAP_COPY 0x0020
441 #define OMAP_SHARED 0x0010
442 #define OMAP_FIXED 0x0100
443
444 nargs.addr = uap->addr;
445 nargs.len = uap->len;
446 nargs.prot = cvtbsdprot[uap->prot & 0x7];
447 nargs.flags = 0;
448 if (uap->flags & OMAP_ANON)
449 nargs.flags |= MAP_ANON;
450 if (uap->flags & OMAP_COPY)
451 nargs.flags |= MAP_COPY;
452 if (uap->flags & OMAP_SHARED)
453 nargs.flags |= MAP_SHARED;
454 else
455 nargs.flags |= MAP_PRIVATE;
456 if (uap->flags & OMAP_FIXED)
457 nargs.flags |= MAP_FIXED;
458 nargs.fd = uap->fd;
459 nargs.pos = uap->pos;
460 return (mmap(td, &nargs));
461 }
462 #endif /* COMPAT_43 */
463
464
465 #ifndef _SYS_SYSPROTO_H_
466 struct msync_args {
467 void *addr;
468 size_t len;
469 int flags;
470 };
471 #endif
472 /*
473 * MPSAFE
474 */
475 int
476 msync(td, uap)
477 struct thread *td;
478 struct msync_args *uap;
479 {
480 vm_offset_t addr;
481 vm_size_t size, pageoff;
482 int flags;
483 vm_map_t map;
484 int rv;
485
486 addr = (vm_offset_t) uap->addr;
487 size = uap->len;
488 flags = uap->flags;
489
490 pageoff = (addr & PAGE_MASK);
491 addr -= pageoff;
492 size += pageoff;
493 size = (vm_size_t) round_page(size);
494 if (addr + size < addr)
495 return (EINVAL);
496
497 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
498 return (EINVAL);
499
500 map = &td->td_proc->p_vmspace->vm_map;
501
502 /*
503 * Clean the pages and interpret the return value.
504 */
505 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
506 (flags & MS_INVALIDATE) != 0);
507 switch (rv) {
508 case KERN_SUCCESS:
509 return (0);
510 case KERN_INVALID_ADDRESS:
511 return (EINVAL); /* Sun returns ENOMEM? */
512 case KERN_INVALID_ARGUMENT:
513 return (EBUSY);
514 default:
515 return (EINVAL);
516 }
517 }
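
A userland sketch of how the flags and return values above are exercised
(path and length are hypothetical; assumes a writable file of at least one
page):

#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        char *p;
        int fd;

        fd = open("/tmp/example", O_RDWR);      /* hypothetical file */
        if (fd == -1)
                return (1);
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                close(fd);
                return (1);
        }
        memcpy(p, "hello", 5);
        /*
         * MS_SYNC waits for the write-back to complete; passing both
         * MS_ASYNC and MS_INVALIDATE would fail with EINVAL per the
         * check above.
         */
        if (msync(p, 4096, MS_SYNC) == -1)
                return (1);
        munmap(p, 4096);
        close(fd);
        return (0);
}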
518
519 #ifndef _SYS_SYSPROTO_H_
520 struct munmap_args {
521 void *addr;
522 size_t len;
523 };
524 #endif
525 /*
526 * MPSAFE
527 */
528 int
529 munmap(td, uap)
530 struct thread *td;
531 struct munmap_args *uap;
532 {
533 #ifdef HWPMC_HOOKS
534 struct pmckern_map_out pkm;
535 vm_map_entry_t entry;
536 #endif
537 vm_offset_t addr;
538 vm_size_t size, pageoff;
539 vm_map_t map;
540
541 addr = (vm_offset_t) uap->addr;
542 size = uap->len;
543 if (size == 0)
544 return (EINVAL);
545
546 pageoff = (addr & PAGE_MASK);
547 addr -= pageoff;
548 size += pageoff;
549 size = (vm_size_t) round_page(size);
550 if (addr + size < addr)
551 return (EINVAL);
552
553 /*
554 * Check for illegal addresses. Watch out for address wrap...
555 */
556 map = &td->td_proc->p_vmspace->vm_map;
557 if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
558 return (EINVAL);
559 vm_map_lock(map);
560 #ifdef HWPMC_HOOKS
561 /*
562 * Inform hwpmc if the address range being unmapped contains
563 * an executable region.
564 */
565 if (vm_map_lookup_entry(map, addr, &entry)) {
566 for (;
567 entry != &map->header && entry->start < addr + size;
568 entry = entry->next) {
569 if (vm_map_check_protection(map, entry->start,
570 entry->end, VM_PROT_EXECUTE) == TRUE) {
571 pkm.pm_address = (uintptr_t) addr;
572 pkm.pm_size = (size_t) size;
573 PMC_CALL_HOOK(td, PMC_FN_MUNMAP,
574 (void *) &pkm);
575 break;
576 }
577 }
578 }
579 #endif
580 /* returns nothing but KERN_SUCCESS anyway */
581 vm_map_delete(map, addr, addr + size);
582 vm_map_unlock(map);
583 return (0);
584 }
585
586 #ifndef _SYS_SYSPROTO_H_
587 struct mprotect_args {
588 const void *addr;
589 size_t len;
590 int prot;
591 };
592 #endif
593 /*
594 * MPSAFE
595 */
596 int
597 mprotect(td, uap)
598 struct thread *td;
599 struct mprotect_args *uap;
600 {
601 vm_offset_t addr;
602 vm_size_t size, pageoff;
603 vm_prot_t prot;
604
605 addr = (vm_offset_t) uap->addr;
606 size = uap->len;
607 prot = uap->prot & VM_PROT_ALL;
608 #if defined(VM_PROT_READ_IS_EXEC)
609 if (prot & VM_PROT_READ)
610 prot |= VM_PROT_EXECUTE;
611 #endif
612
613 pageoff = (addr & PAGE_MASK);
614 addr -= pageoff;
615 size += pageoff;
616 size = (vm_size_t) round_page(size);
617 if (addr + size < addr)
618 return (EINVAL);
619
620 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
621 addr + size, prot, FALSE)) {
622 case KERN_SUCCESS:
623 return (0);
624 case KERN_PROTECTION_FAILURE:
625 return (EACCES);
626 }
627 return (EINVAL);
628 }
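
A short userland illustration of the protection change handled above
(anonymous mapping, page size queried at run time):

#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        long ps = sysconf(_SC_PAGESIZE);
        char *p;

        p = mmap(NULL, ps, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                return (1);
        p[0] = 'x';
        /*
         * Drop write permission; a later store through p would now fault.
         * Requesting more than the mapping's maximum protection fails
         * with EACCES (KERN_PROTECTION_FAILURE above).
         */
        if (mprotect(p, ps, PROT_READ) == -1)
                return (1);
        munmap(p, ps);
        return (0);
}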
629
630 #ifndef _SYS_SYSPROTO_H_
631 struct minherit_args {
632 void *addr;
633 size_t len;
634 int inherit;
635 };
636 #endif
637 /*
638 * MPSAFE
639 */
640 int
641 minherit(td, uap)
642 struct thread *td;
643 struct minherit_args *uap;
644 {
645 vm_offset_t addr;
646 vm_size_t size, pageoff;
647 vm_inherit_t inherit;
648
649 addr = (vm_offset_t)uap->addr;
650 size = uap->len;
651 inherit = uap->inherit;
652
653 pageoff = (addr & PAGE_MASK);
654 addr -= pageoff;
655 size += pageoff;
656 size = (vm_size_t) round_page(size);
657 if (addr + size < addr)
658 return (EINVAL);
659
660 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
661 addr + size, inherit)) {
662 case KERN_SUCCESS:
663 return (0);
664 case KERN_PROTECTION_FAILURE:
665 return (EACCES);
666 }
667 return (EINVAL);
668 }
669
670 #ifndef _SYS_SYSPROTO_H_
671 struct madvise_args {
672 void *addr;
673 size_t len;
674 int behav;
675 };
676 #endif
677
678 /*
679 * MPSAFE
680 */
681 /* ARGSUSED */
682 int
683 madvise(td, uap)
684 struct thread *td;
685 struct madvise_args *uap;
686 {
687 vm_offset_t start, end;
688 vm_map_t map;
689 struct proc *p;
690 int error;
691
692 /*
693 * Check for our special case, advising the swap pager we are
694 * "immortal."
695 */
696 if (uap->behav == MADV_PROTECT) {
697 error = priv_check(td, PRIV_VM_MADV_PROTECT);
698 if (error == 0) {
699 p = td->td_proc;
700 PROC_LOCK(p);
701 p->p_flag |= P_PROTECTED;
702 PROC_UNLOCK(p);
703 }
704 return (error);
705 }
706 /*
707 * Check for illegal behavior
708 */
709 if (uap->behav < 0 || uap->behav > MADV_CORE)
710 return (EINVAL);
711 /*
712 * Check for illegal addresses. Watch out for address wrap... Note
713 * that VM_*_ADDRESS are not constants due to casts (argh).
714 */
715 map = &td->td_proc->p_vmspace->vm_map;
716 if ((vm_offset_t)uap->addr < vm_map_min(map) ||
717 (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
718 return (EINVAL);
719 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
720 return (EINVAL);
721
722 /*
723 * Since this routine is only advisory, we default to conservative
724 * behavior.
725 */
726 start = trunc_page((vm_offset_t) uap->addr);
727 end = round_page((vm_offset_t) uap->addr + uap->len);
728
729 if (vm_map_madvise(map, start, end, uap->behav))
730 return (EINVAL);
731 return (0);
732 }
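
A userland sketch of the advisory calls handled above; MADV_WILLNEED takes
the ordinary advisory path, while MADV_PROTECT is the privileged special case
that marks the whole process P_PROTECTED (addresses come from an anonymous
mapping, sizes are illustrative):

#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        long ps = sysconf(_SC_PAGESIZE);
        size_t len = 16 * (size_t)ps;
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                return (1);
        /* Purely advisory: hint that the range will be touched soon. */
        if (madvise(p, len, MADV_WILLNEED) == -1)
                return (1);
        /*
         * MADV_PROTECT ignores the address range and needs
         * PRIV_VM_MADV_PROTECT; an unprivileged caller gets an error
         * (typically EPERM), so the result is not treated as fatal here.
         */
        (void)madvise(p, len, MADV_PROTECT);
        munmap(p, len);
        return (0);
}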
733
734 #ifndef _SYS_SYSPROTO_H_
735 struct mincore_args {
736 const void *addr;
737 size_t len;
738 char *vec;
739 };
740 #endif
741
742 /*
743 * MPSAFE
744 */
745 /* ARGSUSED */
746 int
747 mincore(td, uap)
748 struct thread *td;
749 struct mincore_args *uap;
750 {
751 vm_offset_t addr, first_addr;
752 vm_offset_t end, cend;
753 pmap_t pmap;
754 vm_map_t map;
755 char *vec;
756 int error = 0;
757 int vecindex, lastvecindex;
758 vm_map_entry_t current;
759 vm_map_entry_t entry;
760 int mincoreinfo;
761 unsigned int timestamp;
762
763 /*
764 * Make sure that the addresses presented are valid for user
765 * mode.
766 */
767 first_addr = addr = trunc_page((vm_offset_t) uap->addr);
768 end = addr + (vm_size_t)round_page(uap->len);
769 map = &td->td_proc->p_vmspace->vm_map;
770 if (end > vm_map_max(map) || end < addr)
771 return (ENOMEM);
772
773 /*
774 * Address of byte vector
775 */
776 vec = uap->vec;
777
778 pmap = vmspace_pmap(td->td_proc->p_vmspace);
779
780 vm_map_lock_read(map);
781 RestartScan:
782 timestamp = map->timestamp;
783
784 if (!vm_map_lookup_entry(map, addr, &entry)) {
785 vm_map_unlock_read(map);
786 return (ENOMEM);
787 }
788
789 /*
790 * Do this on a map entry basis so that if the pages are not
791 * in the current process's address space, we can easily look
792 * up the pages elsewhere.
793 */
794 lastvecindex = -1;
795 for (current = entry;
796 (current != &map->header) && (current->start < end);
797 current = current->next) {
798
799 /*
800 * check for contiguity
801 */
802 if (current->end < end &&
803 (current->next == &map->header ||
804 current->next->start > current->end)) {
805 vm_map_unlock_read(map);
806 return (ENOMEM);
807 }
808
809 /*
810 * ignore submaps (for now) or null objects
811 */
812 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
813 current->object.vm_object == NULL)
814 continue;
815
816 /*
817 * limit this scan to the current map entry and the
818 * limits for the mincore call
819 */
820 if (addr < current->start)
821 addr = current->start;
822 cend = current->end;
823 if (cend > end)
824 cend = end;
825
826 /*
827 * scan this entry one page at a time
828 */
829 while (addr < cend) {
830 /*
831 * Check the pmap first; it is likely faster, and
832 * it can also tell us whether we are the one
833 * referencing or modifying the page.
834 */
835 mincoreinfo = pmap_mincore(pmap, addr);
836 if (!mincoreinfo) {
837 vm_pindex_t pindex;
838 vm_ooffset_t offset;
839 vm_page_t m;
840 /*
841 * calculate the page index into the object
842 */
843 offset = current->offset + (addr - current->start);
844 pindex = OFF_TO_IDX(offset);
845 VM_OBJECT_LOCK(current->object.vm_object);
846 m = vm_page_lookup(current->object.vm_object,
847 pindex);
848 /*
849 * if the page is resident, then gather information about
850 * it.
851 */
852 if (m != NULL && m->valid != 0) {
853 mincoreinfo = MINCORE_INCORE;
854 vm_page_lock_queues();
855 if (m->dirty ||
856 pmap_is_modified(m))
857 mincoreinfo |= MINCORE_MODIFIED_OTHER;
858 if ((m->flags & PG_REFERENCED) ||
859 pmap_ts_referenced(m)) {
860 vm_page_flag_set(m, PG_REFERENCED);
861 mincoreinfo |= MINCORE_REFERENCED_OTHER;
862 }
863 vm_page_unlock_queues();
864 }
865 VM_OBJECT_UNLOCK(current->object.vm_object);
866 }
867
868 /*
869 * subyte may page fault. In case it needs to modify
870 * the map, we release the lock.
871 */
872 vm_map_unlock_read(map);
873
874 /*
875 * calculate index into user supplied byte vector
876 */
877 vecindex = OFF_TO_IDX(addr - first_addr);
878
879 /*
880 * If we have skipped map entries, we need to make sure that
881 * the byte vector is zeroed for those skipped entries.
882 */
883 while ((lastvecindex + 1) < vecindex) {
884 error = subyte(vec + lastvecindex, 0);
885 if (error) {
886 error = EFAULT;
887 goto done2;
888 }
889 ++lastvecindex;
890 }
891
892 /*
893 * Pass the page information to the user
894 */
895 error = subyte(vec + vecindex, mincoreinfo);
896 if (error) {
897 error = EFAULT;
898 goto done2;
899 }
900
901 /*
902 * If the map has changed, due to the subyte, the previous
903 * output may be invalid.
904 */
905 vm_map_lock_read(map);
906 if (timestamp != map->timestamp)
907 goto RestartScan;
908
909 lastvecindex = vecindex;
910 addr += PAGE_SIZE;
911 }
912 }
913
914 /*
915 * subyte may page fault. In case it needs to modify
916 * the map, we release the lock.
917 */
918 vm_map_unlock_read(map);
919
920 /*
921 * Zero the last entries in the byte vector.
922 */
923 vecindex = OFF_TO_IDX(end - first_addr);
924 while ((lastvecindex + 1) < vecindex) {
925 error = subyte(vec + lastvecindex, 0);
926 if (error) {
927 error = EFAULT;
928 goto done2;
929 }
930 ++lastvecindex;
931 }
932
933 /*
934 * If the map has changed, due to the subyte, the previous
935 * output may be invalid.
936 */
937 vm_map_lock_read(map);
938 if (timestamp != map->timestamp)
939 goto RestartScan;
940 vm_map_unlock_read(map);
941 done2:
942 return (error);
943 }
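
A userland sketch of the byte-vector contract implemented above: mincore()
fills one status byte per page of the range, which is exactly the vecindex
arithmetic in the loop (anonymous mapping, four pages, sizes illustrative):

#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        long ps = sysconf(_SC_PAGESIZE);
        size_t len = 4 * (size_t)ps;
        char vec[4];                    /* one status byte per page */
        char *p;
        int i;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                return (1);
        p[0] = 1;                       /* fault in the first page only */
        if (mincore(p, len, vec) == -1)
                return (1);
        for (i = 0; i < 4; i++)
                printf("page %d: %s\n", i,
                    (vec[i] & MINCORE_INCORE) ? "resident" : "not resident");
        munmap(p, len);
        return (0);
}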
944
945 #ifndef _SYS_SYSPROTO_H_
946 struct mlock_args {
947 const void *addr;
948 size_t len;
949 };
950 #endif
951 /*
952 * MPSAFE
953 */
954 int
955 mlock(td, uap)
956 struct thread *td;
957 struct mlock_args *uap;
958 {
959 struct proc *proc;
960 vm_offset_t addr, end, last, start;
961 vm_size_t npages, size;
962 int error;
963
964 error = priv_check(td, PRIV_VM_MLOCK);
965 if (error)
966 return (error);
967 addr = (vm_offset_t)uap->addr;
968 size = uap->len;
969 last = addr + size;
970 start = trunc_page(addr);
971 end = round_page(last);
972 if (last < addr || end < addr)
973 return (EINVAL);
974 npages = atop(end - start);
975 if (npages > vm_page_max_wired)
976 return (ENOMEM);
977 proc = td->td_proc;
978 PROC_LOCK(proc);
979 if (ptoa(npages +
980 pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
981 lim_cur(proc, RLIMIT_MEMLOCK)) {
982 PROC_UNLOCK(proc);
983 return (ENOMEM);
984 }
985 PROC_UNLOCK(proc);
986 if (npages + cnt.v_wire_count > vm_page_max_wired)
987 return (EAGAIN);
988 error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
989 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
990 return (error == KERN_SUCCESS ? 0 : ENOMEM);
991 }
992
993 #ifndef _SYS_SYSPROTO_H_
994 struct mlockall_args {
995 int how;
996 };
997 #endif
998
999 /*
1000 * MPSAFE
1001 */
1002 int
1003 mlockall(td, uap)
1004 struct thread *td;
1005 struct mlockall_args *uap;
1006 {
1007 vm_map_t map;
1008 int error;
1009
1010 map = &td->td_proc->p_vmspace->vm_map;
1011 error = 0;
1012
1013 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1014 return (EINVAL);
1015
1016 #if 0
1017 /*
1018 * If wiring all pages in the process would cause it to exceed
1019 * a hard resource limit, return ENOMEM.
1020 */
1021 PROC_LOCK(td->td_proc);
1022 if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
1023 lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
1024 PROC_UNLOCK(td->td_proc);
1025 return (ENOMEM);
1026 }
1027 PROC_UNLOCK(td->td_proc);
1028 #else
1029 error = priv_check(td, PRIV_VM_MLOCK);
1030 if (error)
1031 return (error);
1032 #endif
1033
1034 if (uap->how & MCL_FUTURE) {
1035 vm_map_lock(map);
1036 vm_map_modflags(map, MAP_WIREFUTURE, 0);
1037 vm_map_unlock(map);
1038 error = 0;
1039 }
1040
1041 if (uap->how & MCL_CURRENT) {
1042 /*
1043 * P1003.1-2001 mandates that all currently mapped pages
1044 * will be memory resident and locked (wired) upon return
1045 * from mlockall(). vm_map_wire() will wire pages, by
1046 * calling vm_fault_wire() for each page in the region.
1047 */
1048 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1049 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1050 error = (error == KERN_SUCCESS ? 0 : EAGAIN);
1051 }
1052
1053 return (error);
1054 }
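
A minimal userland use of the interface above; in this kernel version the
call requires PRIV_VM_MLOCK (the rlimit-based check is under #if 0), so an
unprivileged process will see an error:

#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
        /*
         * MCL_CURRENT wires every existing mapping; MCL_FUTURE sets
         * MAP_WIREFUTURE so mappings created later are wired as well.
         */
        if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1) {
                perror("mlockall");
                return (1);
        }
        /* ... latency-sensitive work that must not take page faults ... */
        munlockall();
        return (0);
}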
1055
1056 #ifndef _SYS_SYSPROTO_H_
1057 struct munlockall_args {
1058 register_t dummy;
1059 };
1060 #endif
1061
1062 /*
1063 * MPSAFE
1064 */
1065 int
1066 munlockall(td, uap)
1067 struct thread *td;
1068 struct munlockall_args *uap;
1069 {
1070 vm_map_t map;
1071 int error;
1072
1073 map = &td->td_proc->p_vmspace->vm_map;
1074 error = priv_check(td, PRIV_VM_MUNLOCK);
1075 if (error)
1076 return (error);
1077
1078 /* Clear the MAP_WIREFUTURE flag from this vm_map. */
1079 vm_map_lock(map);
1080 vm_map_modflags(map, 0, MAP_WIREFUTURE);
1081 vm_map_unlock(map);
1082
1083 /* Forcibly unwire all pages. */
1084 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1085 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1086
1087 return (error);
1088 }
1089
1090 #ifndef _SYS_SYSPROTO_H_
1091 struct munlock_args {
1092 const void *addr;
1093 size_t len;
1094 };
1095 #endif
1096 /*
1097 * MPSAFE
1098 */
1099 int
1100 munlock(td, uap)
1101 struct thread *td;
1102 struct munlock_args *uap;
1103 {
1104 vm_offset_t addr, end, last, start;
1105 vm_size_t size;
1106 int error;
1107
1108 error = priv_check(td, PRIV_VM_MUNLOCK);
1109 if (error)
1110 return (error);
1111 addr = (vm_offset_t)uap->addr;
1112 size = uap->len;
1113 last = addr + size;
1114 start = trunc_page(addr);
1115 end = round_page(last);
1116 if (last < addr || end < addr)
1117 return (EINVAL);
1118 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
1119 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
1120 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1121 }
1122
1123 /*
1124 * vm_mmap_vnode()
1125 *
1126 * MPSAFE
1127 *
1128 * Helper function for vm_mmap. Performs sanity checks specific to mmap
1129 * operations on vnodes.
1130 */
1131 int
1132 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
1133 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1134 struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
1135 {
1136 struct vattr va;
1137 vm_object_t obj;
1138 vm_offset_t foff;
1139 struct mount *mp;
1140 int error, flags;
1141 int vfslocked;
1142
1143 mp = vp->v_mount;
1144 vfslocked = VFS_LOCK_GIANT(mp);
1145 if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
1146 VFS_UNLOCK_GIANT(vfslocked);
1147 return (error);
1148 }
1149 foff = *foffp;
1150 flags = *flagsp;
1151 obj = vp->v_object;
1152 if (vp->v_type == VREG) {
1153 /*
1154 * Get the proper underlying object
1155 */
1156 if (obj == NULL) {
1157 error = EINVAL;
1158 goto done;
1159 }
1160 if (obj->handle != vp) {
1161 vput(vp);
1162 vp = (struct vnode*)obj->handle;
1163 vget(vp, LK_EXCLUSIVE, td);
1164 }
1165 } else if (vp->v_type == VCHR) {
1166 error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
1167 vp->v_rdev, foffp, objp);
1168 if (error == 0)
1169 goto mark_atime;
1170 goto done;
1171 } else {
1172 error = EINVAL;
1173 goto done;
1174 }
1175 if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
1176 goto done;
1177 }
1178 #ifdef MAC
1179 error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags);
1180 if (error != 0)
1181 goto done;
1182 #endif
1183 if ((flags & MAP_SHARED) != 0) {
1184 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
1185 if (prot & PROT_WRITE) {
1186 error = EPERM;
1187 goto done;
1188 }
1189 *maxprotp &= ~VM_PROT_WRITE;
1190 }
1191 }
1192 /*
1193 * If it is a regular file with no remaining links (va_nlink == 0),
1194 * we do not need to sync it.
1195 * Adjust the object size to be the size of the actual file.
1196 */
1197 objsize = round_page(va.va_size);
1198 if (va.va_nlink == 0)
1199 flags |= MAP_NOSYNC;
1200 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff);
1201 if (obj == NULL) {
1202 error = ENOMEM;
1203 goto done;
1204 }
1205 *objp = obj;
1206 *flagsp = flags;
1207
1208 mark_atime:
1209 vfs_mark_atime(vp, td);
1210
1211 done:
1212 vput(vp);
1213 VFS_UNLOCK_GIANT(vfslocked);
1214 return (error);
1215 }
1216
1217 /*
1218 * vm_mmap_cdev()
1219 *
1220 * MPSAFE
1221 *
1222 * Helper function for vm_mmap. Performs sanity checks specific to mmap
1223 * operations on cdevs.
1224 */
1225 int
1226 vm_mmap_cdev(struct thread *td, vm_size_t objsize,
1227 vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
1228 struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
1229 {
1230 vm_object_t obj;
1231 struct cdevsw *dsw;
1232 int error, flags;
1233
1234 flags = *flagsp;
1235
1236 dsw = dev_refthread(cdev);
1237 if (dsw == NULL)
1238 return (ENXIO);
1239 if (dsw->d_flags & D_MMAP_ANON) {
1240 dev_relthread(cdev);
1241 *maxprotp = VM_PROT_ALL;
1242 *flagsp |= MAP_ANON;
1243 return (0);
1244 }
1245 /*
1246 * cdevs do not provide private mappings of any kind.
1247 */
1248 if ((*maxprotp & VM_PROT_WRITE) == 0 &&
1249 (prot & PROT_WRITE) != 0) {
1250 dev_relthread(cdev);
1251 return (EACCES);
1252 }
1253 if (flags & (MAP_PRIVATE|MAP_COPY)) {
1254 dev_relthread(cdev);
1255 return (EINVAL);
1256 }
1257 /*
1258 * Force device mappings to be shared.
1259 */
1260 flags |= MAP_SHARED;
1261 #ifdef MAC_XXX
1262 error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
1263 if (error != 0) {
1264 dev_relthread(cdev);
1265 return (error);
1266 }
1267 #endif
1268 /*
1269 * First, try d_mmap_single(). If that is not implemented
1270 * (returns ENODEV), fall back to using the device pager.
1271 * Note that d_mmap_single() must return a reference to the
1272 * object (it needs to bump the reference count of the object
1273 * it returns somehow).
1274 *
1275 * XXX assumes VM_PROT_* == PROT_*
1276 */
1277 error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
1278 dev_relthread(cdev);
1279 if (error != ENODEV)
1280 return (error);
1281 obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff);
1282 if (obj == NULL)
1283 return (EINVAL);
1284 *objp = obj;
1285 *flagsp = flags;
1286 return (0);
1287 }
1288
1289 /*
1290 * vm_mmap()
1291 *
1292 * MPSAFE
1293 *
1294 * Internal version of mmap. Currently used by mmap, exec, and sys5
1295 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON.
1296 */
1297 int
1298 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1299 vm_prot_t maxprot, int flags,
1300 objtype_t handle_type, void *handle,
1301 vm_ooffset_t foff)
1302 {
1303 boolean_t fitit;
1304 vm_object_t object = NULL;
1305 int rv = KERN_SUCCESS;
1306 int docow, error;
1307 struct thread *td = curthread;
1308
1309 if (size == 0)
1310 return (0);
1311
1312 size = round_page(size);
1313
1314 PROC_LOCK(td->td_proc);
1315 if (td->td_proc->p_vmspace->vm_map.size + size >
1316 lim_cur(td->td_proc, RLIMIT_VMEM)) {
1317 PROC_UNLOCK(td->td_proc);
1318 return(ENOMEM);
1319 }
1320 PROC_UNLOCK(td->td_proc);
1321
1322 /*
1323 * We currently can only deal with page aligned file offsets.
1324 * The check is here rather than in the syscall because the
1325 * kernel calls this function internally for other mmaping
1326 * operations (such as in exec) and non-aligned offsets will
1327 * cause pmap inconsistencies...so we want to be sure to
1328 * disallow this in all cases.
1329 */
1330 if (foff & PAGE_MASK)
1331 return (EINVAL);
1332
1333 if ((flags & MAP_FIXED) == 0) {
1334 fitit = TRUE;
1335 *addr = round_page(*addr);
1336 } else {
1337 if (*addr != trunc_page(*addr))
1338 return (EINVAL);
1339 fitit = FALSE;
1340 }
1341 /*
1342 * Lookup/allocate object.
1343 */
1344 switch (handle_type) {
1345 case OBJT_DEVICE:
1346 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
1347 handle, &foff, &object);
1348 break;
1349 case OBJT_VNODE:
1350 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
1351 handle, &foff, &object);
1352 break;
1353 case OBJT_DEFAULT:
1354 if (handle == NULL) {
1355 error = 0;
1356 break;
1357 }
1358 /* FALLTHROUGH */
1359 default:
1360 error = EINVAL;
1361 break;
1362 }
1363 if (error)
1364 return (error);
1365 if (flags & MAP_ANON) {
1366 object = NULL;
1367 docow = 0;
1368 /*
1369 * Unnamed anonymous regions always start at 0.
1370 */
1371 if (handle == 0)
1372 foff = 0;
1373 } else {
1374 docow = MAP_PREFAULT_PARTIAL;
1375 }
1376
1377 if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1378 docow |= MAP_COPY_ON_WRITE;
1379 if (flags & MAP_NOSYNC)
1380 docow |= MAP_DISABLE_SYNCER;
1381 if (flags & MAP_NOCORE)
1382 docow |= MAP_DISABLE_COREDUMP;
1383
1384 #if defined(VM_PROT_READ_IS_EXEC)
1385 if (prot & VM_PROT_READ)
1386 prot |= VM_PROT_EXECUTE;
1387
1388 if (maxprot & VM_PROT_READ)
1389 maxprot |= VM_PROT_EXECUTE;
1390 #endif
1391
1392 if (flags & MAP_STACK)
1393 rv = vm_map_stack(map, *addr, size, prot, maxprot,
1394 docow | MAP_STACK_GROWS_DOWN);
1395 else if (fitit)
1396 rv = vm_map_find(map, object, foff, addr, size,
1397 object != NULL && object->type == OBJT_DEVICE ?
1398 VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
1399 else
1400 rv = vm_map_fixed(map, object, foff, *addr, size,
1401 prot, maxprot, docow);
1402
1403 if (rv != KERN_SUCCESS) {
1404 /*
1405 * Lose the object reference. Will destroy the
1406 * object if it's an unnamed anonymous mapping
1407 * or named anonymous without other references.
1408 */
1409 vm_object_deallocate(object);
1410 } else if (flags & MAP_SHARED) {
1411 /*
1412 * Shared memory is also shared with children.
1413 */
1414 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
1415 if (rv != KERN_SUCCESS)
1416 (void) vm_map_remove(map, *addr, *addr + size);
1417 }
1418
1419 /*
1420 * If the process has requested that all future mappings
1421 * be wired, then heed this.
1422 */
1423 if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
1424 vm_map_wire(map, *addr, *addr + size,
1425 VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
1426
1427 switch (rv) {
1428 case KERN_SUCCESS:
1429 return (0);
1430 case KERN_INVALID_ADDRESS:
1431 case KERN_NO_SPACE:
1432 return (ENOMEM);
1433 case KERN_PROTECTION_FAILURE:
1434 return (EACCES);
1435 default:
1436 return (EINVAL);
1437 }
1438 }