FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c
1 /*
2 * Copyright (c) 1988 University of Utah.
3 * Copyright (c) 1991, 1993
4 * The Regents of the University of California. All rights reserved.
5 *
6 * This code is derived from software contributed to Berkeley by
7 * the Systems Programming Group of the University of Utah Computer
8 * Science Department.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
39 *
40 * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
41 */
42
43 /*
44 * Mapped file (mmap) interface to VM
45 */
46
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD: releng/5.2/sys/vm/vm_mmap.c 123469 2003-12-11 20:30:15Z kan $");
49
50 #include "opt_compat.h"
51 #include "opt_mac.h"
52
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/kernel.h>
56 #include <sys/lock.h>
57 #include <sys/mutex.h>
58 #include <sys/sysproto.h>
59 #include <sys/filedesc.h>
60 #include <sys/proc.h>
61 #include <sys/resource.h>
62 #include <sys/resourcevar.h>
63 #include <sys/vnode.h>
64 #include <sys/fcntl.h>
65 #include <sys/file.h>
66 #include <sys/mac.h>
67 #include <sys/mman.h>
68 #include <sys/conf.h>
69 #include <sys/stat.h>
70 #include <sys/vmmeter.h>
71 #include <sys/sysctl.h>
72
73 #include <vm/vm.h>
74 #include <vm/vm_param.h>
75 #include <vm/pmap.h>
76 #include <vm/vm_map.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_page.h>
79 #include <vm/vm_pager.h>
80 #include <vm/vm_pageout.h>
81 #include <vm/vm_extern.h>
82 #include <vm/vm_page.h>
83 #include <vm/vm_kern.h>
84
85 #ifndef _SYS_SYSPROTO_H_
86 struct sbrk_args {
87 int incr;
88 };
89 #endif
90
91 static int max_proc_mmap;
92 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
93
94 /*
95 * Set the maximum number of vm_map_entry structures per process. Roughly
96 * speaking, vm_map_entry structures are tiny, so allowing them to eat 1/100
97 * of our KVM malloc space still results in generous limits. We want a
98 * default that is good enough to prevent the kernel from running out of
99 * resources when attacked from a compromised user account, but generous
100 * enough that multi-threaded processes are not unduly inconvenienced.
101 */
102 static void vmmapentry_rsrc_init(void *);
103 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)
104
105 static void
106 vmmapentry_rsrc_init(dummy)
107 void *dummy;
108 {
109 max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
110 max_proc_mmap /= 100;
111 }
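/*
 * Editorial note (illustrative, not part of the original source): a rough
 * worked example of the limit computed above.  Assuming vm_kmem_size is
 * 64 MB and sizeof(struct vm_map_entry) is about 64 bytes (both values are
 * hypothetical and vary by platform and kernel version):
 *
 *	max_proc_mmap = (64 * 1024 * 1024) / 64;	// ~1,048,576
 *	max_proc_mmap /= 100;				// ~10,485
 *
 * so a process with vm_refcnt == 1 may create on the order of ten thousand
 * map entries before mmap() starts failing with ENOMEM.
 */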
112
113 /*
114 * MPSAFE
115 */
116 /* ARGSUSED */
117 int
118 sbrk(td, uap)
119 struct thread *td;
120 struct sbrk_args *uap;
121 {
122 /* Not yet implemented */
123 /* mtx_lock(&Giant); */
124 /* mtx_unlock(&Giant); */
125 return (EOPNOTSUPP);
126 }
127
128 #ifndef _SYS_SYSPROTO_H_
129 struct sstk_args {
130 int incr;
131 };
132 #endif
133
134 /*
135 * MPSAFE
136 */
137 /* ARGSUSED */
138 int
139 sstk(td, uap)
140 struct thread *td;
141 struct sstk_args *uap;
142 {
143 /* Not yet implemented */
144 /* mtx_lock(&Giant); */
145 /* mtx_unlock(&Giant); */
146 return (EOPNOTSUPP);
147 }
148
149 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
150 #ifndef _SYS_SYSPROTO_H_
151 struct getpagesize_args {
152 int dummy;
153 };
154 #endif
155
156 /* ARGSUSED */
157 int
158 ogetpagesize(td, uap)
159 struct thread *td;
160 struct getpagesize_args *uap;
161 {
162 /* MP SAFE */
163 td->td_retval[0] = PAGE_SIZE;
164 return (0);
165 }
166 #endif /* COMPAT_43 || COMPAT_SUNOS */
167
168
169 /*
170 * Memory Map (mmap) system call. Note that the file offset
171 * and address are allowed to be NOT page aligned, though if
172 * the MAP_FIXED flag is set, both must have the same remainder
173 * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
174 * page-aligned, the actual mapping starts at trunc_page(addr)
175 * and the return value is adjusted up by the page offset.
176 *
177 * Generally speaking, only character devices which are themselves
178 * memory-based, such as a video framebuffer, can be mmap'd. Otherwise
179 * there would be no cache coherency between a descriptor and a VM mapping
180 * both to the same character device.
181 *
182 * Block devices can be mmap'd no matter what they represent. Cache coherency
183 * is maintained as long as you do not write directly to the underlying
184 * character device.
185 */
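/*
 * Editorial usage sketch (not part of the original source): the offset
 * behaviour described above, seen from userland.  The path and offset are
 * hypothetical; without MAP_FIXED the kernel backs the mapping from
 * trunc_page(offset) and returns a pointer bumped up by the page offset,
 * so *p below corresponds to byte 0x123 of the file.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/tmp/example", O_RDONLY);
 *	char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0x123);
 *	// error checking omitted; p == MAP_FAILED on failure
 */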
186 #ifndef _SYS_SYSPROTO_H_
187 struct mmap_args {
188 void *addr;
189 size_t len;
190 int prot;
191 int flags;
192 int fd;
193 long pad;
194 off_t pos;
195 };
196 #endif
197
198 /*
199 * MPSAFE
200 */
201 int
202 mmap(td, uap)
203 struct thread *td;
204 struct mmap_args *uap;
205 {
206 struct file *fp = NULL;
207 struct vnode *vp;
208 vm_offset_t addr;
209 vm_size_t size, pageoff;
210 vm_prot_t prot, maxprot;
211 void *handle;
212 int flags, error;
213 int disablexworkaround;
214 off_t pos;
215 struct vmspace *vms = td->td_proc->p_vmspace;
216 vm_object_t obj;
217
218 addr = (vm_offset_t) uap->addr;
219 size = uap->len;
220 prot = uap->prot & VM_PROT_ALL;
221 flags = uap->flags;
222 pos = uap->pos;
223
224 vp = NULL;
225 fp = NULL;
226 /* make sure mapping fits into numeric range etc */
227 if ((ssize_t) uap->len < 0 ||
228 ((flags & MAP_ANON) && uap->fd != -1))
229 return (EINVAL);
230
231 if (flags & MAP_STACK) {
232 if ((uap->fd != -1) ||
233 ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
234 return (EINVAL);
235 flags |= MAP_ANON;
236 pos = 0;
237 }
238
239 /*
240 * Align the file position to a page boundary,
241 * and save its page offset component.
242 */
243 pageoff = (pos & PAGE_MASK);
244 pos -= pageoff;
245
246 /* Adjust size for rounding (on both ends). */
247 size += pageoff; /* low end... */
248 size = (vm_size_t) round_page(size); /* hi end */
249
250 /*
251 * Check for illegal addresses. Watch out for address wrap... Note
252 * that VM_*_ADDRESS are not constants due to casts (argh).
253 */
254 if (flags & MAP_FIXED) {
255 /*
256 * The specified address must have the same remainder
257 * as the file offset taken modulo PAGE_SIZE, so it
258 * should be aligned after adjustment by pageoff.
259 */
260 addr -= pageoff;
261 if (addr & PAGE_MASK)
262 return (EINVAL);
263 /* Address range must be all in user VM space. */
264 if (addr < vm_map_min(&vms->vm_map) ||
265 addr + size > vm_map_max(&vms->vm_map))
266 return (EINVAL);
267 if (addr + size < addr)
268 return (EINVAL);
269 }
270 /*
271 * XXX for non-fixed mappings where no hint is provided or
272 * the hint would fall in the potential heap space,
273 * place it after the end of the largest possible heap.
274 *
275 * There should really be a pmap call to determine a reasonable
276 * location.
277 */
278 else if (addr == 0 ||
279 (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
280 addr < round_page((vm_offset_t)vms->vm_daddr +
281 td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max)))
282 addr = round_page((vm_offset_t)vms->vm_daddr +
283 td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max);
284
285 mtx_lock(&Giant); /* syscall marked mp-safe but isn't */
286 do {
287 if (flags & MAP_ANON) {
288 /*
289 * Mapping blank space is trivial.
290 */
291 handle = NULL;
292 maxprot = VM_PROT_ALL;
293 pos = 0;
294 break;
295 }
296 /*
297 * Mapping file, get fp for validation. Obtain vnode and make
298 * sure it is of appropriate type.
299 * don't let the descriptor disappear on us if we block
300 */
301 if ((error = fget(td, uap->fd, &fp)) != 0)
302 goto done;
303 if (fp->f_type != DTYPE_VNODE) {
304 error = EINVAL;
305 goto done;
306 }
307
308 /*
309 * POSIX shared-memory objects are defined to have
310 * kernel persistence, and are not defined to support
311 * read(2)/write(2) -- or even open(2). Thus, we can
312 * use MAP_ASYNC to trade on-disk coherence for speed.
313 * The shm_open(3) library routine turns on the FPOSIXSHM
314 * flag to request this behavior.
315 */
316 if (fp->f_flag & FPOSIXSHM)
317 flags |= MAP_NOSYNC;
318 vp = fp->f_vnode;
319 error = vget(vp, LK_EXCLUSIVE, td);
320 if (error)
321 goto done;
322 if (vp->v_type != VREG && vp->v_type != VCHR) {
323 error = EINVAL;
324 goto done;
325 }
326 if (vp->v_type == VREG) {
327 /*
328 * Get the proper underlying object
329 */
330 if (VOP_GETVOBJECT(vp, &obj) != 0) {
331 error = EINVAL;
332 goto done;
333 }
334 if (obj->handle != vp) {
335 vput(vp);
336 vp = (struct vnode*)obj->handle;
337 vget(vp, LK_EXCLUSIVE, td);
338 }
339 }
340 /*
341 * XXX hack to handle use of /dev/zero to map anon memory (ala
342 * SunOS).
343 */
344 if ((vp->v_type == VCHR) &&
345 (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
346 handle = NULL;
347 maxprot = VM_PROT_ALL;
348 flags |= MAP_ANON;
349 pos = 0;
350 break;
351 }
352 /*
353 * cdevs do not provide private mappings of any kind.
354 */
355 /*
356 * However, for the XIG X server to continue to work,
357 * we should allow the superuser to do it anyway.
358 * We only allow it at securelevel < 1.
359 * (Because the XIG X server writes directly to video
360 * memory via /dev/mem, it should never work at any
361 * other securelevel.)
362 * XXX this will have to go
363 */
364 if (securelevel_ge(td->td_ucred, 1))
365 disablexworkaround = 1;
366 else
367 disablexworkaround = suser(td);
368 if (vp->v_type == VCHR && disablexworkaround &&
369 (flags & (MAP_PRIVATE|MAP_COPY))) {
370 error = EINVAL;
371 goto done;
372 }
373 /*
374 * Ensure that file and memory protections are
375 * compatible. Note that we only worry about
376 * writability if mapping is shared; in this case,
377 * current and max prot are dictated by the open file.
378 * XXX use the vnode instead? Problem is: what
379 * credentials do we use for determination? What if
380 * proc does a setuid?
381 */
382 maxprot = VM_PROT_EXECUTE; /* ??? */
383 if (fp->f_flag & FREAD) {
384 maxprot |= VM_PROT_READ;
385 } else if (prot & PROT_READ) {
386 error = EACCES;
387 goto done;
388 }
389 /*
390 * If we are sharing potential changes (either via
391 * MAP_SHARED or via the implicit sharing of character
392 * device mappings), and we are trying to get write
393 * permission although we opened it without asking
394 * for it, bail out. Check for superuser, only if
395 * we're at securelevel < 1, to allow the XIG X server
396 * to continue to work.
397 */
398 if ((flags & MAP_SHARED) != 0 ||
399 (vp->v_type == VCHR && disablexworkaround)) {
400 if ((fp->f_flag & FWRITE) != 0) {
401 struct vattr va;
402 if ((error =
403 VOP_GETATTR(vp, &va,
404 td->td_ucred, td))) {
405 goto done;
406 }
407 if ((va.va_flags &
408 (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
409 maxprot |= VM_PROT_WRITE;
410 } else if (prot & PROT_WRITE) {
411 error = EPERM;
412 goto done;
413 }
414 } else if ((prot & PROT_WRITE) != 0) {
415 error = EACCES;
416 goto done;
417 }
418 } else {
419 maxprot |= VM_PROT_WRITE;
420 }
421
422 handle = (void *)vp;
423 } while (0);
424
425 /*
426 * Do not allow more than a certain number of vm_map_entry structures
427 * per process. Scale with the number of rforks sharing the map
428 * to make the limit reasonable for threads.
429 */
430 if (max_proc_mmap &&
431 vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
432 error = ENOMEM;
433 goto done;
434 }
435
436 error = 0;
437 #ifdef MAC
438 if (handle != NULL && (flags & MAP_SHARED) != 0) {
439 error = mac_check_vnode_mmap(td->td_ucred,
440 (struct vnode *)handle, prot);
441 }
442 #endif
443 if (vp != NULL) {
444 vput(vp);
445 vp = NULL;
446 }
447 mtx_unlock(&Giant);
448 if (error == 0)
449 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
450 flags, handle, pos);
451 mtx_lock(&Giant);
452 if (error == 0)
453 td->td_retval[0] = (register_t) (addr + pageoff);
454 done:
455 if (vp)
456 vput(vp);
457 mtx_unlock(&Giant);
458 if (fp)
459 fdrop(fp, td);
460
461 return (error);
462 }
463
464 #ifdef COMPAT_43
465 #ifndef _SYS_SYSPROTO_H_
466 struct ommap_args {
467 caddr_t addr;
468 int len;
469 int prot;
470 int flags;
471 int fd;
472 long pos;
473 };
474 #endif
475 int
476 ommap(td, uap)
477 struct thread *td;
478 struct ommap_args *uap;
479 {
480 struct mmap_args nargs;
481 static const char cvtbsdprot[8] = {
482 0,
483 PROT_EXEC,
484 PROT_WRITE,
485 PROT_EXEC | PROT_WRITE,
486 PROT_READ,
487 PROT_EXEC | PROT_READ,
488 PROT_WRITE | PROT_READ,
489 PROT_EXEC | PROT_WRITE | PROT_READ,
490 };
491
492 #define OMAP_ANON 0x0002
493 #define OMAP_COPY 0x0020
494 #define OMAP_SHARED 0x0010
495 #define OMAP_FIXED 0x0100
496
497 nargs.addr = uap->addr;
498 nargs.len = uap->len;
499 nargs.prot = cvtbsdprot[uap->prot & 0x7];
500 nargs.flags = 0;
501 if (uap->flags & OMAP_ANON)
502 nargs.flags |= MAP_ANON;
503 if (uap->flags & OMAP_COPY)
504 nargs.flags |= MAP_COPY;
505 if (uap->flags & OMAP_SHARED)
506 nargs.flags |= MAP_SHARED;
507 else
508 nargs.flags |= MAP_PRIVATE;
509 if (uap->flags & OMAP_FIXED)
510 nargs.flags |= MAP_FIXED;
511 nargs.fd = uap->fd;
512 nargs.pos = uap->pos;
513 return (mmap(td, &nargs));
514 }
515 #endif /* COMPAT_43 */
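/*
 * Editorial example (illustrative only): the translation performed by
 * ommap() above.  An old-style call with prot 0x6 and flags
 * (OMAP_ANON | OMAP_FIXED) is forwarded to mmap() as
 *
 *	nargs.prot  = cvtbsdprot[6];	// PROT_WRITE | PROT_READ
 *	nargs.flags = MAP_ANON | MAP_PRIVATE | MAP_FIXED;
 *
 * MAP_PRIVATE is supplied because OMAP_SHARED is not set.
 */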
516
517
518 #ifndef _SYS_SYSPROTO_H_
519 struct msync_args {
520 void *addr;
521 int len;
522 int flags;
523 };
524 #endif
525 /*
526 * MPSAFE
527 */
528 int
529 msync(td, uap)
530 struct thread *td;
531 struct msync_args *uap;
532 {
533 vm_offset_t addr;
534 vm_size_t size, pageoff;
535 int flags;
536 vm_map_t map;
537 int rv;
538
539 addr = (vm_offset_t) uap->addr;
540 size = uap->len;
541 flags = uap->flags;
542
543 pageoff = (addr & PAGE_MASK);
544 addr -= pageoff;
545 size += pageoff;
546 size = (vm_size_t) round_page(size);
547 if (addr + size < addr)
548 return (EINVAL);
549
550 if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
551 return (EINVAL);
552
553 map = &td->td_proc->p_vmspace->vm_map;
554
555 /*
556 * Clean the pages and interpret the return value.
557 */
558 rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
559 (flags & MS_INVALIDATE) != 0);
560 switch (rv) {
561 case KERN_SUCCESS:
562 return (0);
563 case KERN_INVALID_ADDRESS:
564 return (EINVAL); /* Sun returns ENOMEM? */
565 case KERN_INVALID_ARGUMENT:
566 return (EBUSY);
567 default:
568 return (EINVAL);
569 }
570 }
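/*
 * Editorial usage sketch (not part of the original source): a userland
 * call that flushes a shared file mapping synchronously.  The buffer name
 * and length are hypothetical; passing MS_ASYNC together with
 * MS_INVALIDATE would be rejected with EINVAL by the check above, and the
 * vm_map_sync() return codes are mapped to 0, EINVAL or EBUSY as shown.
 *
 *	#include <sys/mman.h>
 *
 *	if (msync(buf, 4096, MS_SYNC) == -1)
 *		err(1, "msync");		// from <err.h>
 */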
571
572 #ifndef _SYS_SYSPROTO_H_
573 struct munmap_args {
574 void *addr;
575 size_t len;
576 };
577 #endif
578 /*
579 * MPSAFE
580 */
581 int
582 munmap(td, uap)
583 struct thread *td;
584 struct munmap_args *uap;
585 {
586 vm_offset_t addr;
587 vm_size_t size, pageoff;
588 vm_map_t map;
589
590 addr = (vm_offset_t) uap->addr;
591 size = uap->len;
592 if (size == 0)
593 return (EINVAL);
594
595 pageoff = (addr & PAGE_MASK);
596 addr -= pageoff;
597 size += pageoff;
598 size = (vm_size_t) round_page(size);
599 if (addr + size < addr)
600 return (EINVAL);
601
602 /*
603 * Check for illegal addresses. Watch out for address wrap...
604 */
605 map = &td->td_proc->p_vmspace->vm_map;
606 if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
607 return (EINVAL);
608 vm_map_lock(map);
609 /*
610 * Make sure entire range is allocated.
611 */
612 if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
613 vm_map_unlock(map);
614 return (EINVAL);
615 }
616 /* returns nothing but KERN_SUCCESS anyway */
617 vm_map_delete(map, addr, addr + size);
618 vm_map_unlock(map);
619 return (0);
620 }
621
622 #ifndef _SYS_SYSPROTO_H_
623 struct mprotect_args {
624 const void *addr;
625 size_t len;
626 int prot;
627 };
628 #endif
629 /*
630 * MPSAFE
631 */
632 int
633 mprotect(td, uap)
634 struct thread *td;
635 struct mprotect_args *uap;
636 {
637 vm_offset_t addr;
638 vm_size_t size, pageoff;
639 vm_prot_t prot;
640
641 addr = (vm_offset_t) uap->addr;
642 size = uap->len;
643 prot = uap->prot & VM_PROT_ALL;
644 #if defined(VM_PROT_READ_IS_EXEC)
645 if (prot & VM_PROT_READ)
646 prot |= VM_PROT_EXECUTE;
647 #endif
648
649 pageoff = (addr & PAGE_MASK);
650 addr -= pageoff;
651 size += pageoff;
652 size = (vm_size_t) round_page(size);
653 if (addr + size < addr)
654 return (EINVAL);
655
656 switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
657 addr + size, prot, FALSE)) {
658 case KERN_SUCCESS:
659 return (0);
660 case KERN_PROTECTION_FAILURE:
661 return (EACCES);
662 }
663 return (EINVAL);
664 }
665
666 #ifndef _SYS_SYSPROTO_H_
667 struct minherit_args {
668 void *addr;
669 size_t len;
670 int inherit;
671 };
672 #endif
673 /*
674 * MPSAFE
675 */
676 int
677 minherit(td, uap)
678 struct thread *td;
679 struct minherit_args *uap;
680 {
681 vm_offset_t addr;
682 vm_size_t size, pageoff;
683 vm_inherit_t inherit;
684
685 addr = (vm_offset_t)uap->addr;
686 size = uap->len;
687 inherit = uap->inherit;
688
689 pageoff = (addr & PAGE_MASK);
690 addr -= pageoff;
691 size += pageoff;
692 size = (vm_size_t) round_page(size);
693 if (addr + size < addr)
694 return (EINVAL);
695
696 switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
697 addr + size, inherit)) {
698 case KERN_SUCCESS:
699 return (0);
700 case KERN_PROTECTION_FAILURE:
701 return (EACCES);
702 }
703 return (EINVAL);
704 }
705
706 #ifndef _SYS_SYSPROTO_H_
707 struct madvise_args {
708 void *addr;
709 size_t len;
710 int behav;
711 };
712 #endif
713
714 /*
715 * MPSAFE
716 */
717 /* ARGSUSED */
718 int
719 madvise(td, uap)
720 struct thread *td;
721 struct madvise_args *uap;
722 {
723 vm_offset_t start, end;
724 vm_map_t map;
725 struct proc *p;
726 int error;
727
728 /*
729 * Check for our special case, advising the swap pager we are
730 * "immortal."
731 */
732 if (uap->behav == MADV_PROTECT) {
733 error = suser(td);
734 if (error == 0) {
735 p = td->td_proc;
736 PROC_LOCK(p);
737 p->p_flag |= P_PROTECTED;
738 PROC_UNLOCK(p);
739 }
740 return (error);
741 }
742 /*
743 * Check for illegal behavior
744 */
745 if (uap->behav < 0 || uap->behav > MADV_CORE)
746 return (EINVAL);
747 /*
748 * Check for illegal addresses. Watch out for address wrap... Note
749 * that VM_*_ADDRESS are not constants due to casts (argh).
750 */
751 map = &td->td_proc->p_vmspace->vm_map;
752 if ((vm_offset_t)uap->addr < vm_map_min(map) ||
753 (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
754 return (EINVAL);
755 if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
756 return (EINVAL);
757
758 /*
759 * Since this routine is only advisory, we default to conservative
760 * behavior.
761 */
762 start = trunc_page((vm_offset_t) uap->addr);
763 end = round_page((vm_offset_t) uap->addr + uap->len);
764
765 if (vm_map_madvise(map, start, end, uap->behav))
766 return (EINVAL);
767 return (0);
768 }
769
770 #ifndef _SYS_SYSPROTO_H_
771 struct mincore_args {
772 const void *addr;
773 size_t len;
774 char *vec;
775 };
776 #endif
777
778 /*
779 * MPSAFE
780 */
781 /* ARGSUSED */
782 int
783 mincore(td, uap)
784 struct thread *td;
785 struct mincore_args *uap;
786 {
787 vm_offset_t addr, first_addr;
788 vm_offset_t end, cend;
789 pmap_t pmap;
790 vm_map_t map;
791 char *vec;
792 int error = 0;
793 int vecindex, lastvecindex;
794 vm_map_entry_t current;
795 vm_map_entry_t entry;
796 int mincoreinfo;
797 unsigned int timestamp;
798
799 /*
800 * Make sure that the addresses presented are valid for user
801 * mode.
802 */
803 first_addr = addr = trunc_page((vm_offset_t) uap->addr);
804 end = addr + (vm_size_t)round_page(uap->len);
805 map = &td->td_proc->p_vmspace->vm_map;
806 if (end > vm_map_max(map) || end < addr)
807 return (EINVAL);
808
809 /*
810 * Address of byte vector
811 */
812 vec = uap->vec;
813
814 pmap = vmspace_pmap(td->td_proc->p_vmspace);
815
816 vm_map_lock_read(map);
817 RestartScan:
818 timestamp = map->timestamp;
819
820 if (!vm_map_lookup_entry(map, addr, &entry))
821 entry = entry->next;
822
823 /*
824 * Do this on a map entry basis so that if the pages are not
825 * in the current process's address space, we can easily look
826 * up the pages elsewhere.
827 */
828 lastvecindex = -1;
829 for (current = entry;
830 (current != &map->header) && (current->start < end);
831 current = current->next) {
832
833 /*
834 * ignore submaps (for now) or null objects
835 */
836 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
837 current->object.vm_object == NULL)
838 continue;
839
840 /*
841 * limit this scan to the current map entry and the
842 * limits for the mincore call
843 */
844 if (addr < current->start)
845 addr = current->start;
846 cend = current->end;
847 if (cend > end)
848 cend = end;
849
850 /*
851 * scan this entry one page at a time
852 */
853 while (addr < cend) {
854 /*
855 * Check pmap first, it is likely faster, also
856 * it can provide info as to whether we are the
857 * one referencing or modifying the page.
858 */
859 mtx_lock(&Giant);
860 mincoreinfo = pmap_mincore(pmap, addr);
861 mtx_unlock(&Giant);
862 if (!mincoreinfo) {
863 vm_pindex_t pindex;
864 vm_ooffset_t offset;
865 vm_page_t m;
866 /*
867 * calculate the page index into the object
868 */
869 offset = current->offset + (addr - current->start);
870 pindex = OFF_TO_IDX(offset);
871 VM_OBJECT_LOCK(current->object.vm_object);
872 m = vm_page_lookup(current->object.vm_object,
873 pindex);
874 /*
875 * if the page is resident, then gather information about
876 * it.
877 */
878 if (m) {
879 mincoreinfo = MINCORE_INCORE;
880 vm_page_lock_queues();
881 if (m->dirty ||
882 pmap_is_modified(m))
883 mincoreinfo |= MINCORE_MODIFIED_OTHER;
884 if ((m->flags & PG_REFERENCED) ||
885 pmap_ts_referenced(m)) {
886 vm_page_flag_set(m, PG_REFERENCED);
887 mincoreinfo |= MINCORE_REFERENCED_OTHER;
888 }
889 vm_page_unlock_queues();
890 }
891 VM_OBJECT_UNLOCK(current->object.vm_object);
892 }
893
894 /*
895 * subyte may page fault. In case it needs to modify
896 * the map, we release the lock.
897 */
898 vm_map_unlock_read(map);
899
900 /*
901 * calculate index into user supplied byte vector
902 */
903 vecindex = OFF_TO_IDX(addr - first_addr);
904
905 /*
906 * If we have skipped map entries, we need to make sure that
907 * the byte vector is zeroed for those skipped entries.
908 */
909 while ((lastvecindex + 1) < vecindex) {
910 error = subyte(vec + lastvecindex, 0);
911 if (error) {
912 error = EFAULT;
913 goto done2;
914 }
915 ++lastvecindex;
916 }
917
918 /*
919 * Pass the page information to the user
920 */
921 error = subyte(vec + vecindex, mincoreinfo);
922 if (error) {
923 error = EFAULT;
924 goto done2;
925 }
926
927 /*
928 * If the map has changed, due to the subyte, the previous
929 * output may be invalid.
930 */
931 vm_map_lock_read(map);
932 if (timestamp != map->timestamp)
933 goto RestartScan;
934
935 lastvecindex = vecindex;
936 addr += PAGE_SIZE;
937 }
938 }
939
940 /*
941 * subyte may page fault. In case it needs to modify
942 * the map, we release the lock.
943 */
944 vm_map_unlock_read(map);
945
946 /*
947 * Zero the last entries in the byte vector.
948 */
949 vecindex = OFF_TO_IDX(end - first_addr);
950 while ((lastvecindex + 1) < vecindex) {
951 error = subyte(vec + lastvecindex, 0);
952 if (error) {
953 error = EFAULT;
954 goto done2;
955 }
956 ++lastvecindex;
957 }
958
959 /*
960 * If the map has changed, due to the subyte, the previous
961 * output may be invalid.
962 */
963 vm_map_lock_read(map);
964 if (timestamp != map->timestamp)
965 goto RestartScan;
966 vm_map_unlock_read(map);
967 done2:
968 return (error);
969 }
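/*
 * Editorial usage sketch (illustrative only): mincore() fills one byte per
 * page of the requested range, using the MINCORE_* bits gathered above
 * from the pmap and the backing VM object.  The base pointer is
 * hypothetical and a 4 KB page size is assumed.
 *
 *	#include <sys/mman.h>
 *
 *	char vec[16];					// one byte per page
 *	if (mincore(base, 16 * 4096, vec) == 0 &&
 *	    (vec[0] & MINCORE_INCORE))
 *		;					// first page is resident
 */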
970
971 #ifndef _SYS_SYSPROTO_H_
972 struct mlock_args {
973 const void *addr;
974 size_t len;
975 };
976 #endif
977 /*
978 * MPSAFE
979 */
980 int
981 mlock(td, uap)
982 struct thread *td;
983 struct mlock_args *uap;
984 {
985 vm_offset_t addr;
986 vm_size_t size, pageoff;
987 int error;
988
989 addr = (vm_offset_t) uap->addr;
990 size = uap->len;
991
992 pageoff = (addr & PAGE_MASK);
993 addr -= pageoff;
994 size += pageoff;
995 size = (vm_size_t) round_page(size);
996
997 /* disable wrap around */
998 if (addr + size < addr)
999 return (EINVAL);
1000
1001 if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
1002 return (EAGAIN);
1003
1004 #if 0
1005 if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
1006 td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
1007 return (ENOMEM);
1008 #else
1009 error = suser(td);
1010 if (error)
1011 return (error);
1012 #endif
1013
1014 error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
1015 addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
1016 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1017 }
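/*
 * Editorial note (illustrative): the global wiring ceiling checked above.
 * Assuming 4 KB pages, an mlock() of 1 MB contributes atop(1 MB) == 256
 * pages; if cnt.v_wire_count plus those 256 pages would exceed
 * vm_page_max_wired, the call fails with EAGAIN before the suser()
 * privilege check is reached.
 */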
1018
1019 #ifndef _SYS_SYSPROTO_H_
1020 struct mlockall_args {
1021 int how;
1022 };
1023 #endif
1024
1025 /*
1026 * MPSAFE
1027 */
1028 int
1029 mlockall(td, uap)
1030 struct thread *td;
1031 struct mlockall_args *uap;
1032 {
1033 vm_map_t map;
1034 int error;
1035
1036 map = &td->td_proc->p_vmspace->vm_map;
1037 error = 0;
1038
1039 if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
1040 return (EINVAL);
1041
1042 #if 0
1043 /*
1044 * If wiring all pages in the process would cause it to exceed
1045 * a hard resource limit, return ENOMEM.
1046 */
1047 if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
1048 td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
1049 return (ENOMEM);
1050 #else
1051 error = suser(td);
1052 if (error)
1053 return (error);
1054 #endif
1055
1056 if (uap->how & MCL_FUTURE) {
1057 vm_map_lock(map);
1058 vm_map_modflags(map, MAP_WIREFUTURE, 0);
1059 vm_map_unlock(map);
1060 error = 0;
1061 }
1062
1063 if (uap->how & MCL_CURRENT) {
1064 /*
1065 * P1003.1-2001 mandates that all currently mapped pages
1066 * will be memory resident and locked (wired) upon return
1067 * from mlockall(). vm_map_wire() will wire pages, by
1068 * calling vm_fault_wire() for each page in the region.
1069 */
1070 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
1071 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1072 error = (error == KERN_SUCCESS ? 0 : EAGAIN);
1073 }
1074
1075 return (error);
1076 }
1077
1078 #ifndef _SYS_SYSPROTO_H_
1079 struct munlockall_args {
1080 register_t dummy;
1081 };
1082 #endif
1083
1084 /*
1085 * MPSAFE
1086 */
1087 int
1088 munlockall(td, uap)
1089 struct thread *td;
1090 struct munlockall_args *uap;
1091 {
1092 vm_map_t map;
1093 int error;
1094
1095 map = &td->td_proc->p_vmspace->vm_map;
1096 error = suser(td);
1097 if (error)
1098 return (error);
1099
1100 /* Clear the MAP_WIREFUTURE flag from this vm_map. */
1101 vm_map_lock(map);
1102 vm_map_modflags(map, 0, MAP_WIREFUTURE);
1103 vm_map_unlock(map);
1104
1105 /* Forcibly unwire all pages. */
1106 error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
1107 VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
1108
1109 return (error);
1110 }
1111
1112 #ifndef _SYS_SYSPROTO_H_
1113 struct munlock_args {
1114 const void *addr;
1115 size_t len;
1116 };
1117 #endif
1118 /*
1119 * MPSAFE
1120 */
1121 int
1122 munlock(td, uap)
1123 struct thread *td;
1124 struct munlock_args *uap;
1125 {
1126 vm_offset_t addr;
1127 vm_size_t size, pageoff;
1128 int error;
1129
1130 addr = (vm_offset_t) uap->addr;
1131 size = uap->len;
1132
1133 pageoff = (addr & PAGE_MASK);
1134 addr -= pageoff;
1135 size += pageoff;
1136 size = (vm_size_t) round_page(size);
1137
1138 /* disable wrap around */
1139 if (addr + size < addr)
1140 return (EINVAL);
1141
1142 error = suser(td);
1143 if (error)
1144 return (error);
1145
1146 error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
1147 addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
1148 return (error == KERN_SUCCESS ? 0 : ENOMEM);
1149 }
1150
1151 /*
1152 * vm_mmap()
1153 *
1154 * MPSAFE
1155 *
1156 * Internal version of mmap. Currently used by mmap, exec, and sys5
1157 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON.
1158 */
1159 int
1160 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
1161 vm_prot_t maxprot, int flags,
1162 void *handle,
1163 vm_ooffset_t foff)
1164 {
1165 boolean_t fitit;
1166 vm_object_t object;
1167 struct vnode *vp = NULL;
1168 objtype_t type;
1169 int rv = KERN_SUCCESS;
1170 vm_ooffset_t objsize;
1171 int docow, error;
1172 struct thread *td = curthread;
1173
1174 if (size == 0)
1175 return (0);
1176
1177 objsize = size = round_page(size);
1178
1179 if (td->td_proc->p_vmspace->vm_map.size + size >
1180 td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
1181 return(ENOMEM);
1182 }
1183
1184 /*
1185 * We currently can only deal with page aligned file offsets.
1186 * The check is here rather than in the syscall because the
1187 * kernel calls this function internally for other mmapping
1188 * operations (such as in exec) and non-aligned offsets will
1189 * cause pmap inconsistencies...so we want to be sure to
1190 * disallow this in all cases.
1191 */
1192 if (foff & PAGE_MASK)
1193 return (EINVAL);
1194
1195 if ((flags & MAP_FIXED) == 0) {
1196 fitit = TRUE;
1197 *addr = round_page(*addr);
1198 } else {
1199 if (*addr != trunc_page(*addr))
1200 return (EINVAL);
1201 fitit = FALSE;
1202 (void) vm_map_remove(map, *addr, *addr + size);
1203 }
1204
1205 /*
1206 * Lookup/allocate object.
1207 */
1208 if (flags & MAP_ANON) {
1209 type = OBJT_DEFAULT;
1210 /*
1211 * Unnamed anonymous regions always start at 0.
1212 */
1213 if (handle == 0)
1214 foff = 0;
1215 } else {
1216 vp = (struct vnode *) handle;
1217 mtx_lock(&Giant);
1218 error = vget(vp, LK_EXCLUSIVE, td);
1219 if (error) {
1220 mtx_unlock(&Giant);
1221 return (error);
1222 }
1223 if (vp->v_type == VCHR) {
1224 type = OBJT_DEVICE;
1225 handle = vp->v_rdev;
1226 vput(vp);
1227 mtx_unlock(&Giant);
1228 } else {
1229 struct vattr vat;
1230
1231 error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
1232 if (error) {
1233 vput(vp);
1234 mtx_unlock(&Giant);
1235 return (error);
1236 }
1237 objsize = round_page(vat.va_size);
1238 type = OBJT_VNODE;
1239 /*
1240 * if it is a regular file without any references
1241 * we do not need to sync it.
1242 */
1243 if (vp->v_type == VREG && vat.va_nlink == 0) {
1244 flags |= MAP_NOSYNC;
1245 }
1246 }
1247 }
1248
1249 if (handle == NULL) {
1250 object = NULL;
1251 docow = 0;
1252 } else {
1253 object = vm_pager_allocate(type,
1254 handle, objsize, prot, foff);
1255 if (type == OBJT_VNODE) {
1256 vput(vp);
1257 mtx_unlock(&Giant);
1258 }
1259 if (object == NULL) {
1260 return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
1261 }
1262 docow = MAP_PREFAULT_PARTIAL;
1263 }
1264
1265 /*
1266 * Force device mappings to be shared.
1267 */
1268 if (type == OBJT_DEVICE) {
1269 flags &= ~(MAP_PRIVATE|MAP_COPY);
1270 flags |= MAP_SHARED;
1271 }
1272
1273 if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
1274 docow |= MAP_COPY_ON_WRITE;
1275 if (flags & MAP_NOSYNC)
1276 docow |= MAP_DISABLE_SYNCER;
1277 if (flags & MAP_NOCORE)
1278 docow |= MAP_DISABLE_COREDUMP;
1279
1280 #if defined(VM_PROT_READ_IS_EXEC)
1281 if (prot & VM_PROT_READ)
1282 prot |= VM_PROT_EXECUTE;
1283
1284 if (maxprot & VM_PROT_READ)
1285 maxprot |= VM_PROT_EXECUTE;
1286 #endif
1287
1288 if (fitit)
1289 *addr = pmap_addr_hint(object, *addr, size);
1290
1291 if (flags & MAP_STACK)
1292 rv = vm_map_stack(map, *addr, size, prot, maxprot,
1293 docow | MAP_STACK_GROWS_DOWN);
1294 else
1295 rv = vm_map_find(map, object, foff, addr, size, fitit,
1296 prot, maxprot, docow);
1297
1298 if (rv != KERN_SUCCESS) {
1299 /*
1300 * Lose the object reference. Will destroy the
1301 * object if it's an unnamed anonymous mapping
1302 * or a named anonymous mapping without other references.
1303 */
1304 vm_object_deallocate(object);
1305 } else if (flags & MAP_SHARED) {
1306 /*
1307 * Shared memory is also shared with children.
1308 */
1309 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
1310 if (rv != KERN_SUCCESS)
1311 (void) vm_map_remove(map, *addr, *addr + size);
1312 }
1313
1314 /*
1315 * If the process has requested that all future mappings
1316 * be wired, then heed this.
1317 */
1318 if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
1319 vm_map_wire(map, *addr, *addr + size,
1320 VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
1321
1322 switch (rv) {
1323 case KERN_SUCCESS:
1324 return (0);
1325 case KERN_INVALID_ADDRESS:
1326 case KERN_NO_SPACE:
1327 return (ENOMEM);
1328 case KERN_PROTECTION_FAILURE:
1329 return (EACCES);
1330 default:
1331 return (EINVAL);
1332 }
1333 }
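/*
 * Editorial sketch (not part of the original source): how an in-kernel
 * caller might request an anonymous mapping through vm_mmap(), mirroring
 * the arguments the mmap() syscall passes for MAP_ANON.  The map pointer
 * and size are hypothetical.
 *
 *	vm_offset_t addr = 0;
 *	int error;
 *
 *	error = vm_mmap(&p->p_vmspace->vm_map, &addr, PAGE_SIZE,
 *	    VM_PROT_ALL, VM_PROT_ALL, MAP_ANON, NULL, 0);
 *	// on success error == 0 and addr holds the address chosen by
 *	// vm_map_find() via the pmap_addr_hint() path above.
 */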