sys/uvm/uvm_mmap.c
1 /* $OpenBSD: uvm_mmap.c,v 1.177 2023/01/16 07:09:11 guenther Exp $ */
2 /* $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $ */
3
4 /*
5 * Copyright (c) 1997 Charles D. Cranor and Washington University.
6 * Copyright (c) 1991, 1993 The Regents of the University of California.
7 * Copyright (c) 1988 University of Utah.
8 *
9 * All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * the Systems Programming Group of the University of Utah Computer
13 * Science Department.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
40 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
41 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
42 */
43
44 /*
45 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
46 * function.
47 */
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/filedesc.h>
53 #include <sys/resourcevar.h>
54 #include <sys/mman.h>
55 #include <sys/mount.h>
56 #include <sys/proc.h>
57 #include <sys/malloc.h>
58 #include <sys/vnode.h>
59 #include <sys/conf.h>
60 #include <sys/signalvar.h>
61 #include <sys/syslog.h>
62 #include <sys/stat.h>
63 #include <sys/specdev.h>
64 #include <sys/stdint.h>
65 #include <sys/pledge.h>
66 #include <sys/unistd.h> /* for KBIND* */
67 #include <sys/user.h>
68
69 #include <machine/exec.h> /* for __LDPGSZ */
70
71 #include <sys/syscallargs.h>
72
73 #include <uvm/uvm.h>
74 #include <uvm/uvm_device.h>
75 #include <uvm/uvm_vnode.h>
76
77 int uvm_mmapanon(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
78 vsize_t, struct proc *);
79 int uvm_mmapfile(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int,
80 struct vnode *, voff_t, vsize_t, struct proc *);
81
82
83 /*
84 * Page align addr and size, returning EINVAL on wraparound.
85 */
86 #define ALIGN_ADDR(addr, size, pageoff) do { \
87 pageoff = (addr & PAGE_MASK); \
88 if (pageoff != 0) { \
89 if (size > SIZE_MAX - pageoff) \
90 return EINVAL; /* wraparound */ \
91 addr -= pageoff; \
92 size += pageoff; \
93 } \
94 if (size != 0) { \
95 size = (vsize_t)round_page(size); \
96 if (size == 0) \
97 return EINVAL; /* wraparound */ \
98 } \
99 } while (0)
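
/*
 * Worked example (illustrative): with PAGE_SIZE 4096, applying
 * ALIGN_ADDR to addr = 0x10234, size = 0x100 gives pageoff = 0x234,
 * addr = 0x10000, and size = round_page(0x334) = 0x1000.  The range
 * is widened to cover every page it touches; EINVAL is returned if
 * either adjustment would wrap past SIZE_MAX.
 */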
100
101 /*
102 * sys_mquery: provide mapping hints to applications that do fixed mappings
103 *
104 * flags: 0 or MAP_FIXED (MAP_FIXED means we insist on this addr and
105 * ignore PMAP_PREFER and the like)
106 * addr: hint where we'd like to place the mapping.
107 * size: size of the mapping
108 * fd: fd of the file we want to map
109 * off: offset within the file
110 */
111 int
112 sys_mquery(struct proc *p, void *v, register_t *retval)
113 {
114 struct sys_mquery_args /* {
115 syscallarg(void *) addr;
116 syscallarg(size_t) len;
117 syscallarg(int) prot;
118 syscallarg(int) flags;
119 syscallarg(int) fd;
120 syscallarg(off_t) pos;
121 } */ *uap = v;
122 struct file *fp;
123 voff_t uoff;
124 int error;
125 vaddr_t vaddr;
126 int flags = 0;
127 vsize_t size;
128 vm_prot_t prot;
129 int fd;
130
131 vaddr = (vaddr_t) SCARG(uap, addr);
132 prot = SCARG(uap, prot);
133 size = (vsize_t) SCARG(uap, len);
134 fd = SCARG(uap, fd);
135
136 if ((prot & PROT_MASK) != prot)
137 return EINVAL;
138
139 if (SCARG(uap, flags) & MAP_FIXED)
140 flags |= UVM_FLAG_FIXED;
141
142 if (fd >= 0) {
143 if ((error = getvnode(p, fd, &fp)) != 0)
144 return error;
145 uoff = SCARG(uap, pos);
146 } else {
147 fp = NULL;
148 uoff = UVM_UNKNOWN_OFFSET;
149 }
150
151 if (vaddr == 0)
152 vaddr = uvm_map_hint(p->p_vmspace, prot, VM_MIN_ADDRESS,
153 VM_MAXUSER_ADDRESS);
154
155 error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff,
156 flags);
157 if (error == 0)
158 *retval = (register_t)(vaddr);
159
160 if (fp != NULL)
161 FRELE(fp, p);
162 return error;
163 }
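
/*
 * Usage sketch (illustrative, error handling elided; needs
 * <sys/mman.h>): probe for a free region near a hint, then commit to
 * it with a fixed mapping.
 *
 *	void *hint = (void *)0x1000000UL;
 *	void *va = mquery(hint, 0x10000, PROT_READ|PROT_WRITE, 0, -1, 0);
 *	if (va != MAP_FAILED)
 *		va = mmap(va, 0x10000, PROT_READ|PROT_WRITE,
 *		    MAP_FIXED|MAP_ANON|MAP_PRIVATE, -1, 0);
 */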
164
165 int uvm_wxabort;
166
167 /*
168 * W^X violations are only allowed on permitted filesystems.
169 */
170 static inline int
171 uvm_wxcheck(struct proc *p, char *call)
172 {
173 struct process *pr = p->p_p;
174 int wxallowed = (pr->ps_textvp->v_mount &&
175 (pr->ps_textvp->v_mount->mnt_flag & MNT_WXALLOWED));
176
177 if (wxallowed && (pr->ps_flags & PS_WXNEEDED))
178 return 0;
179
180 if (uvm_wxabort) {
181 KERNEL_LOCK();
182 /* Report W^X failures */
183 if (pr->ps_wxcounter++ == 0)
184 log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
185 pr->ps_comm, pr->ps_pid, call);
186 /* Send uncatchable SIGABRT for coredump */
187 sigexit(p, SIGABRT);
188 KERNEL_UNLOCK();
189 }
190
191 return ENOTSUP;
192 }
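
/*
 * Illustrative consequence of the check above: a writable and
 * executable mapping is refused unless the running binary was marked
 * wxneeded and lives on a filesystem mounted wxallowed.
 *
 *	void *p = mmap(NULL, 4096, PROT_READ|PROT_WRITE|PROT_EXEC,
 *	    MAP_ANON|MAP_PRIVATE, -1, 0);
 *	if (p == MAP_FAILED && errno == ENOTSUP)
 *		warnx("W^X enforced");
 */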
193
194 /*
195 * sys_mmap: mmap system call.
196 *
197 * => file offset and address may not be page aligned
198 * - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
199 * - if address isn't page aligned the mapping starts at trunc_page(addr)
200 * and the return value is adjusted up by the page offset.
201 */
202 int
203 sys_mmap(struct proc *p, void *v, register_t *retval)
204 {
205 struct sys_mmap_args /* {
206 syscallarg(void *) addr;
207 syscallarg(size_t) len;
208 syscallarg(int) prot;
209 syscallarg(int) flags;
210 syscallarg(int) fd;
211 syscallarg(off_t) pos;
212 } */ *uap = v;
213 vaddr_t addr;
214 struct vattr va;
215 off_t pos;
216 vsize_t limit, pageoff, size;
217 vm_prot_t prot, maxprot;
218 int flags, fd;
219 vaddr_t vm_min_address = VM_MIN_ADDRESS;
220 struct filedesc *fdp = p->p_fd;
221 struct file *fp = NULL;
222 struct vnode *vp;
223 int error;
224
225 /* first, extract syscall args from the uap. */
226 addr = (vaddr_t) SCARG(uap, addr);
227 size = (vsize_t) SCARG(uap, len);
228 prot = SCARG(uap, prot);
229 flags = SCARG(uap, flags);
230 fd = SCARG(uap, fd);
231 pos = SCARG(uap, pos);
232
233 /*
234 * Validate the flags.
235 */
236 if ((prot & PROT_MASK) != prot)
237 return EINVAL;
238 if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
239 (error = uvm_wxcheck(p, "mmap")))
240 return error;
241
242 if ((flags & MAP_FLAGMASK) != flags)
243 return EINVAL;
244 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
245 return EINVAL;
246 if ((flags & (MAP_FIXED|__MAP_NOREPLACE)) == __MAP_NOREPLACE)
247 return EINVAL;
248 if (flags & MAP_STACK) {
249 if ((flags & (MAP_ANON|MAP_PRIVATE)) != (MAP_ANON|MAP_PRIVATE))
250 return EINVAL;
251 if (flags & ~(MAP_STACK|MAP_FIXED|MAP_ANON|MAP_PRIVATE))
252 return EINVAL;
253 if (pos != 0)
254 return EINVAL;
255 if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
256 return EINVAL;
257 }
258 if (size == 0)
259 return EINVAL;
260
261 error = pledge_protexec(p, prot);
262 if (error)
263 return error;
264
265 /* align file position and save offset. adjust size. */
266 ALIGN_ADDR(pos, size, pageoff);
267
268 /* now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" */
269 if (flags & MAP_FIXED) {
270 /* adjust address by the same amount as we did the offset */
271 addr -= pageoff;
272 if (addr & PAGE_MASK)
273 return EINVAL; /* not page aligned */
274
275 if (addr > SIZE_MAX - size)
276 return EINVAL; /* no wrapping! */
277 if (VM_MAXUSER_ADDRESS > 0 &&
278 (addr + size) > VM_MAXUSER_ADDRESS)
279 return EINVAL;
280 if (vm_min_address > 0 && addr < vm_min_address)
281 return EINVAL;
282 }
283
284 /* check for file mappings (i.e. not anonymous) and verify file. */
285 if ((flags & MAP_ANON) == 0) {
286 KERNEL_LOCK();
287 if ((fp = fd_getfile(fdp, fd)) == NULL) {
288 error = EBADF;
289 goto out;
290 }
291
292 if (fp->f_type != DTYPE_VNODE) {
293 error = ENODEV; /* only mmap vnodes! */
294 goto out;
295 }
296 vp = (struct vnode *)fp->f_data; /* convert to vnode */
297
298 if (vp->v_type != VREG && vp->v_type != VCHR &&
299 vp->v_type != VBLK) {
300 error = ENODEV; /* only REG/CHR/BLK support mmap */
301 goto out;
302 }
303
304 if (vp->v_type == VREG && (pos + size) < pos) {
305 error = EINVAL; /* no offset wrapping */
306 goto out;
307 }
308
309 /* special case: catch SunOS style /dev/zero */
310 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
311 flags |= MAP_ANON;
312 FRELE(fp, p);
313 fp = NULL;
314 KERNEL_UNLOCK();
315 goto is_anon;
316 }
317
318 /*
319 * Old programs may not select a specific sharing type, so
320 * default to an appropriate one.
321 */
322 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
323 #if defined(DEBUG)
324 printf("WARNING: defaulted mmap() share type to"
325 " %s (pid %d comm %s)\n",
326 vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE",
327 p->p_p->ps_pid, p->p_p->ps_comm);
328 #endif
329 if (vp->v_type == VCHR)
330 flags |= MAP_SHARED; /* for a device */
331 else
332 flags |= MAP_PRIVATE; /* for a file */
333 }
334
335 /*
336 * MAP_PRIVATE device mappings don't make sense (and aren't
337 * supported anyway). However, some programs rely on this,
338 * so just change it to MAP_SHARED.
339 */
340 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
341 flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
342 }
343
344 /* now check protection */
345 maxprot = PROT_EXEC;
346
347 /* check read access */
348 if (fp->f_flag & FREAD)
349 maxprot |= PROT_READ;
350 else if (prot & PROT_READ) {
351 error = EACCES;
352 goto out;
353 }
354
355 /* check write access, shared case first */
356 if (flags & MAP_SHARED) {
357 /*
358 * if the file is writable, only add PROT_WRITE to
359 * maxprot if the file is not immutable, append-only.
360 * otherwise, if we have asked for PROT_WRITE, return
361 * EPERM.
362 */
363 if (fp->f_flag & FWRITE) {
364 error = VOP_GETATTR(vp, &va, p->p_ucred, p);
365 if (error)
366 goto out;
367 if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
368 maxprot |= PROT_WRITE;
369 else if (prot & PROT_WRITE) {
370 error = EPERM;
371 goto out;
372 }
373 } else if (prot & PROT_WRITE) {
374 error = EACCES;
375 goto out;
376 }
377 } else {
378 /* MAP_PRIVATE mappings can always be written to (copy-on-write) */
379 maxprot |= PROT_WRITE;
380 }
381 if ((flags & __MAP_NOFAULT) != 0 ||
382 ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
383 limit = lim_cur(RLIMIT_DATA);
384 if (limit < size ||
385 limit - size < ptoa(p->p_vmspace->vm_dused)) {
386 error = ENOMEM;
387 goto out;
388 }
389 }
390 error = uvm_mmapfile(&p->p_vmspace->vm_map, &addr, size, prot,
391 maxprot, flags, vp, pos, lim_cur(RLIMIT_MEMLOCK), p);
392 FRELE(fp, p);
393 KERNEL_UNLOCK();
394 } else { /* MAP_ANON case */
395 if (fd != -1)
396 return EINVAL;
397
398 is_anon: /* label for SunOS style /dev/zero */
399
400 /* __MAP_NOFAULT only makes sense with a backing object */
401 if ((flags & __MAP_NOFAULT) != 0)
402 return EINVAL;
403
404 if (prot != PROT_NONE || (flags & MAP_SHARED)) {
405 limit = lim_cur(RLIMIT_DATA);
406 if (limit < size ||
407 limit - size < ptoa(p->p_vmspace->vm_dused)) {
408 return ENOMEM;
409 }
410 }
411
412 /*
413 * We've been treating (MAP_SHARED|MAP_PRIVATE) == 0 as
414 * MAP_PRIVATE, so make that clear.
415 */
416 if ((flags & MAP_SHARED) == 0)
417 flags |= MAP_PRIVATE;
418
419 maxprot = PROT_MASK;
420 error = uvm_mmapanon(&p->p_vmspace->vm_map, &addr, size, prot,
421 maxprot, flags, lim_cur(RLIMIT_MEMLOCK), p);
422 }
423
424 if (error == 0)
425 /* remember to add offset */
426 *retval = (register_t)(addr + pageoff);
427
428 return error;
429
430 out:
431 KERNEL_UNLOCK();
432 if (fp)
433 FRELE(fp, p);
434 return error;
435 }
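
/*
 * Example (illustrative, assuming fd is an open file) of the
 * alignment rule documented above: a fixed mapping at an unaligned
 * address succeeds only if the file offset is misaligned by the same
 * amount, and the returned pointer keeps that sub-page offset.
 *
 *	char *p = mmap((void *)0x20001234UL, 0x1000, PROT_READ,
 *	    MAP_FIXED|MAP_SHARED, fd, 0x1234);
 *
 * On success p == 0x20001234; the underlying mapping starts at the
 * page boundary 0x20001000, backed by file offset 0x1000.
 */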
436
437 #if 1
438 int
439 sys_pad_mquery(struct proc *p, void *v, register_t *retval)
440 {
441 struct sys_pad_mquery_args *uap = v;
442 struct sys_mquery_args unpad;
443
444 SCARG(&unpad, addr) = SCARG(uap, addr);
445 SCARG(&unpad, len) = SCARG(uap, len);
446 SCARG(&unpad, prot) = SCARG(uap, prot);
447 SCARG(&unpad, flags) = SCARG(uap, flags);
448 SCARG(&unpad, fd) = SCARG(uap, fd);
449 SCARG(&unpad, pos) = SCARG(uap, pos);
450 return sys_mquery(p, &unpad, retval);
451 }
452
453 int
454 sys_pad_mmap(struct proc *p, void *v, register_t *retval)
455 {
456 struct sys_pad_mmap_args *uap = v;
457 struct sys_mmap_args unpad;
458
459 SCARG(&unpad, addr) = SCARG(uap, addr);
460 SCARG(&unpad, len) = SCARG(uap, len);
461 SCARG(&unpad, prot) = SCARG(uap, prot);
462 SCARG(&unpad, flags) = SCARG(uap, flags);
463 SCARG(&unpad, fd) = SCARG(uap, fd);
464 SCARG(&unpad, pos) = SCARG(uap, pos);
465 return sys_mmap(p, &unpad, retval);
466 }
467 #endif
468
469 /*
470 * sys_msync: the msync system call (a front-end for flush)
471 */
472
473 int
474 sys_msync(struct proc *p, void *v, register_t *retval)
475 {
476 struct sys_msync_args /* {
477 syscallarg(void *) addr;
478 syscallarg(size_t) len;
479 syscallarg(int) flags;
480 } */ *uap = v;
481 vaddr_t addr;
482 vsize_t size, pageoff;
483 vm_map_t map;
484 int flags, uvmflags;
485
486 /* extract syscall args from the uap */
487 addr = (vaddr_t)SCARG(uap, addr);
488 size = (vsize_t)SCARG(uap, len);
489 flags = SCARG(uap, flags);
490
491 /* sanity check flags */
492 if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
493 (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
494 (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
495 return EINVAL;
496 if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
497 flags |= MS_SYNC;
498
499 /* align the address to a page boundary, and adjust the size accordingly */
500 ALIGN_ADDR(addr, size, pageoff);
501 if (addr > SIZE_MAX - size)
502 return EINVAL; /* disallow wrap-around. */
503
504 /* get map */
505 map = &p->p_vmspace->vm_map;
506
507 /* translate MS_ flags into PGO_ flags */
508 uvmflags = PGO_CLEANIT;
509 if (flags & MS_INVALIDATE)
510 uvmflags |= PGO_FREE;
511 if (flags & MS_SYNC)
512 uvmflags |= PGO_SYNCIO;
513 else
514 uvmflags |= PGO_SYNCIO; /* XXXCDC: force sync for now! */
515
516 return uvm_map_clean(map, addr, addr+size, uvmflags);
517 }
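
/*
 * Usage sketch (illustrative; p and len are from a prior shared file
 * mapping): flush modified pages back to the file and wait for the
 * write to complete.
 *
 *	if (msync(p, len, MS_SYNC) == -1)
 *		warn("msync");
 *
 * As the XXXCDC note above records, MS_ASYNC is currently forced
 * synchronous as well.
 */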
518
519 /*
520 * sys_munmap: unmap a user's memory
521 */
522 int
523 sys_munmap(struct proc *p, void *v, register_t *retval)
524 {
525 struct sys_munmap_args /* {
526 syscallarg(void *) addr;
527 syscallarg(size_t) len;
528 } */ *uap = v;
529 vaddr_t addr;
530 vsize_t size, pageoff;
531 vm_map_t map;
532 vaddr_t vm_min_address = VM_MIN_ADDRESS;
533 struct uvm_map_deadq dead_entries;
534
535 /* get syscall args... */
536 addr = (vaddr_t) SCARG(uap, addr);
537 size = (vsize_t) SCARG(uap, len);
538
539 /* align address to a page boundary, and adjust size accordingly */
540 ALIGN_ADDR(addr, size, pageoff);
541
542 /*
543 * Check for illegal addresses. Watch out for address wrap...
544 * Note that VM_*_ADDRESS are not constants due to casts (argh).
545 */
546 if (addr > SIZE_MAX - size)
547 return EINVAL;
548 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
549 return EINVAL;
550 if (vm_min_address > 0 && addr < vm_min_address)
551 return EINVAL;
552 map = &p->p_vmspace->vm_map;
553
554
555 vm_map_lock(map); /* lock map so we can checkprot */
556
557 /*
558 * interesting system call semantic: make sure entire range is
559 * allocated before allowing an unmap.
560 */
561 if (!uvm_map_checkprot(map, addr, addr + size, PROT_NONE)) {
562 vm_map_unlock(map);
563 return EINVAL;
564 }
565
566 TAILQ_INIT(&dead_entries);
567 if (uvm_unmap_remove(map, addr, addr + size, &dead_entries,
568 FALSE, TRUE, TRUE) != 0) {
569 vm_map_unlock(map);
570 return EPERM; /* immutable entries found */
571 }
572 vm_map_unlock(map); /* and unlock */
573
574 uvm_unmap_detach(&dead_entries, 0);
575
576 return 0;
577 }
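
/*
 * Example (illustrative) of the whole-range check above: unmapping a
 * range containing a hole fails up front instead of partially
 * succeeding.
 *
 *	char *p = mmap(NULL, 3 * 4096, PROT_READ|PROT_WRITE,
 *	    MAP_ANON|MAP_PRIVATE, -1, 0);
 *	munmap(p + 4096, 4096);		(punches a hole: succeeds)
 *	munmap(p, 3 * 4096);		(EINVAL: range is no longer
 *					 fully allocated)
 */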
578
579 /*
580 * sys_mprotect: the mprotect system call
581 */
582 int
583 sys_mprotect(struct proc *p, void *v, register_t *retval)
584 {
585 struct sys_mprotect_args /* {
586 syscallarg(void *) addr;
587 syscallarg(size_t) len;
588 syscallarg(int) prot;
589 } */ *uap = v;
590 vaddr_t addr;
591 vsize_t size, pageoff;
592 vm_prot_t prot;
593 int error;
594
595 /*
596 * extract syscall args from uap
597 */
598
599 addr = (vaddr_t)SCARG(uap, addr);
600 size = (vsize_t)SCARG(uap, len);
601 prot = SCARG(uap, prot);
602
603 if ((prot & PROT_MASK) != prot)
604 return EINVAL;
605 if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) &&
606 (error = uvm_wxcheck(p, "mprotect")))
607 return error;
608
609 error = pledge_protexec(p, prot);
610 if (error)
611 return error;
612
613 /*
614 * align the address to a page boundary, and adjust the size accordingly
615 */
616 ALIGN_ADDR(addr, size, pageoff);
617 if (addr > SIZE_MAX - size)
618 return EINVAL; /* disallow wrap-around. */
619
620 return (uvm_map_protect(&p->p_vmspace->vm_map, addr, addr+size,
621 prot, 0, FALSE, TRUE));
622 }
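
/*
 * Usage sketch (illustrative; p and len describe an existing
 * mapping): drop write access once a region has been initialized.
 *
 *	if (mprotect(p, len, PROT_READ) == -1)
 *		warn("mprotect");
 *
 * Requests combining PROT_WRITE and PROT_EXEC pass through the same
 * uvm_wxcheck() gate as mmap() does.
 */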
623
624 /*
625 * sys_msyscall: the msyscall system call
626 */
627 int
628 sys_msyscall(struct proc *p, void *v, register_t *retval)
629 {
630 struct sys_msyscall_args /* {
631 syscallarg(void *) addr;
632 syscallarg(size_t) len;
633 } */ *uap = v;
634 vaddr_t addr;
635 vsize_t size, pageoff;
636
637 addr = (vaddr_t)SCARG(uap, addr);
638 size = (vsize_t)SCARG(uap, len);
639
640 /*
641 * align the address to a page boundary, and adjust the size accordingly
642 */
643 ALIGN_ADDR(addr, size, pageoff);
644 if (addr > SIZE_MAX - size)
645 return EINVAL; /* disallow wrap-around. */
646
647 return uvm_map_syscall(&p->p_vmspace->vm_map, addr, addr+size);
648 }
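
/*
 * Context sketch (illustrative; libc_text and libc_size are
 * hypothetical names): msyscall(2) is issued by ld.so during startup
 * to register the region, normally libc's text, from which system
 * call instructions are permitted; it is not intended for direct
 * application use.
 *
 *	msyscall(libc_text, libc_size);
 */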
649
650 /*
651 * sys_mimmutable: the mimmutable system call
652 */
653 int
654 sys_mimmutable(struct proc *p, void *v, register_t *retval)
655 {
656 struct sys_mimmutable_args /* {
657 immutablearg(void *) addr;
658 immutablearg(size_t) len;
659 } */ *uap = v;
660 vaddr_t addr;
661 vsize_t size, pageoff;
662
663 addr = (vaddr_t)SCARG(uap, addr);
664 size = (vsize_t)SCARG(uap, len);
665
666 /*
667 * align the address to a page boundary, and adjust the size accordingly
668 */
669 ALIGN_ADDR(addr, size, pageoff);
670 if (addr > SIZE_MAX - size)
671 return EINVAL; /* disallow wrap-around. */
672
673 return uvm_map_immutable(&p->p_vmspace->vm_map, addr, addr+size, 1);
674 }
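
/*
 * Usage sketch (illustrative; p and len describe an existing
 * mapping): freeze a region after setup so that later mprotect(),
 * munmap() or replacing mmap() calls covering it fail.
 *
 *	if (mimmutable(p, len) == -1)
 *		warn("mimmutable");
 */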
675
676 /*
677 * sys_minherit: the minherit system call
678 */
679 int
680 sys_minherit(struct proc *p, void *v, register_t *retval)
681 {
682 struct sys_minherit_args /* {
683 syscallarg(void *) addr;
684 syscallarg(size_t) len;
685 syscallarg(int) inherit;
686 } */ *uap = v;
687 vaddr_t addr;
688 vsize_t size, pageoff;
689 vm_inherit_t inherit;
690
691 addr = (vaddr_t)SCARG(uap, addr);
692 size = (vsize_t)SCARG(uap, len);
693 inherit = SCARG(uap, inherit);
694
695 /*
696 * align the address to a page boundary, and adjust the size accordingly
697 */
698 ALIGN_ADDR(addr, size, pageoff);
699 if (addr > SIZE_MAX - size)
700 return EINVAL; /* disallow wrap-around. */
701
702 return (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
703 inherit));
704 }
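
/*
 * Usage sketch (illustrative; p and len describe an existing
 * mapping): keep a buffer holding secrets out of future children.
 *
 *	if (minherit(p, len, MAP_INHERIT_NONE) == -1)
 *		warn("minherit");
 *
 * After fork(), the child sees that range unmapped.
 */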
705
706 /*
707 * sys_madvise: give advice about memory usage.
708 */
709 /* ARGSUSED */
710 int
711 sys_madvise(struct proc *p, void *v, register_t *retval)
712 {
713 struct sys_madvise_args /* {
714 syscallarg(void *) addr;
715 syscallarg(size_t) len;
716 syscallarg(int) behav;
717 } */ *uap = v;
718 vaddr_t addr;
719 vsize_t size, pageoff;
720 int advice, error;
721
722 addr = (vaddr_t)SCARG(uap, addr);
723 size = (vsize_t)SCARG(uap, len);
724 advice = SCARG(uap, behav);
725
726 /*
727 * align the address to a page boundary, and adjust the size accordingly
728 */
729 ALIGN_ADDR(addr, size, pageoff);
730 if (addr > SIZE_MAX - size)
731 return EINVAL; /* disallow wrap-around. */
732
733 switch (advice) {
734 case MADV_NORMAL:
735 case MADV_RANDOM:
736 case MADV_SEQUENTIAL:
737 error = uvm_map_advice(&p->p_vmspace->vm_map, addr,
738 addr + size, advice);
739 break;
740
741 case MADV_WILLNEED:
742 /*
743 * Activate all these pages, pre-faulting them in if
744 * necessary.
745 */
746 /*
747 * XXX IMPLEMENT ME.
748 * Should invent a "weak" mode for uvm_fault()
749 * which would only do the PGO_LOCKED pgo_get().
750 */
751 return 0;
752
753 case MADV_DONTNEED:
754 /*
755 * Deactivate all these pages. We don't need them
756 * any more. We don't, however, toss the data in
757 * the pages.
758 */
759 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
760 PGO_DEACTIVATE);
761 break;
762
763 case MADV_FREE:
764 /*
765 * These pages contain no valid data, and may be
766 * garbage-collected. Toss all resources, including
767 * any swap space in use.
768 */
769 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
770 PGO_FREE);
771 break;
772
773 case MADV_SPACEAVAIL:
774 /*
775 * XXXMRG What is this? I think it's:
776 *
777 * Ensure that we have allocated backing-store
778 * for these pages.
779 *
780 * This is going to require changes to the page daemon,
781 * as it will free swap space allocated to pages in core.
782 * There's also what to do for device/file/anonymous memory.
783 */
784 return EINVAL;
785
786 default:
787 return EINVAL;
788 }
789
790 return error;
791 }
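
/*
 * Usage sketch (illustrative; p and len describe an existing
 * mapping), contrasting the two destructive hints handled above:
 *
 *	madvise(p, len, MADV_DONTNEED);	(pages may be deactivated,
 *					 contents are preserved)
 *	madvise(p, len, MADV_FREE);	(contents may be discarded
 *					 outright, including swap)
 */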
792
793 /*
794 * sys_mlock: memory lock
795 */
796
797 int
798 sys_mlock(struct proc *p, void *v, register_t *retval)
799 {
800 struct sys_mlock_args /* {
801 syscallarg(const void *) addr;
802 syscallarg(size_t) len;
803 } */ *uap = v;
804 vaddr_t addr;
805 vsize_t size, pageoff;
806 int error;
807
808 /* extract syscall args from uap */
809 addr = (vaddr_t)SCARG(uap, addr);
810 size = (vsize_t)SCARG(uap, len);
811
812 /* align address to a page boundary and adjust size accordingly */
813 ALIGN_ADDR(addr, size, pageoff);
814 if (addr > SIZE_MAX - size)
815 return EINVAL; /* disallow wrap-around. */
816
817 if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
818 return EAGAIN;
819
820 #ifdef pmap_wired_count
821 if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
822 lim_cur(RLIMIT_MEMLOCK))
823 return EAGAIN;
824 #else
825 if ((error = suser(p)) != 0)
826 return error;
827 #endif
828
829 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
830 0);
831 return error == 0 ? 0 : ENOMEM;
832 }
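
/*
 * Usage sketch (illustrative; buf and len are the caller's): wire a
 * buffer into physical memory, subject to the RLIMIT_MEMLOCK and
 * global wiredmax checks above.
 *
 *	if (mlock(buf, len) == -1)
 *		warn("mlock");
 */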
833
834 /*
835 * sys_munlock: unlock wired pages
836 */
837
838 int
839 sys_munlock(struct proc *p, void *v, register_t *retval)
840 {
841 struct sys_munlock_args /* {
842 syscallarg(const void *) addr;
843 syscallarg(size_t) len;
844 } */ *uap = v;
845 vaddr_t addr;
846 vsize_t size, pageoff;
847 int error;
848
849 /* extract syscall args from uap */
850 addr = (vaddr_t)SCARG(uap, addr);
851 size = (vsize_t)SCARG(uap, len);
852
853 /* align address to a page boundary, and adjust size accordingly */
854 ALIGN_ADDR(addr, size, pageoff);
855 if (addr > SIZE_MAX - size)
856 return EINVAL; /* disallow wrap-around. */
857
858 #ifndef pmap_wired_count
859 if ((error = suser(p)) != 0)
860 return error;
861 #endif
862
863 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
864 0);
865 return error == 0 ? 0 : ENOMEM;
866 }
867
868 /*
869 * sys_mlockall: lock all pages mapped into an address space.
870 */
871 int
872 sys_mlockall(struct proc *p, void *v, register_t *retval)
873 {
874 struct sys_mlockall_args /* {
875 syscallarg(int) flags;
876 } */ *uap = v;
877 int error, flags;
878
879 flags = SCARG(uap, flags);
880
881 if (flags == 0 ||
882 (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
883 return EINVAL;
884
885 #ifndef pmap_wired_count
886 if ((error = suser(p)) != 0)
887 return error;
888 #endif
889
890 error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
891 lim_cur(RLIMIT_MEMLOCK));
892 if (error != 0 && error != ENOMEM)
893 return EAGAIN;
894 return error;
895 }
896
897 /*
898 * sys_munlockall: unlock all pages mapped into an address space.
899 */
900 int
901 sys_munlockall(struct proc *p, void *v, register_t *retval)
902 {
903
904 (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
905 return 0;
906 }
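
/*
 * Usage sketch (illustrative): wire everything mapped now and in the
 * future.  MCL_FUTURE sets VM_MAP_WIREFUTURE on the map, which
 * uvm_mmaplock() below acts on for each new mapping.
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		warn("mlockall");
 */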
907
908 /*
909 * common code for uvm_mmapanon and uvm_mmapfile to lock a mapping
910 */
911 int
912 uvm_mmaplock(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
913 vsize_t locklimit)
914 {
915 int error;
916
917 /*
918 * POSIX 1003.1b -- if our address space was configured
919 * to lock all future mappings, wire the one we just made.
920 */
921 if (prot == PROT_NONE) {
922 /*
923 * No more work to do in this case.
924 */
925 return 0;
926 }
927
928 vm_map_lock(map);
929 if (map->flags & VM_MAP_WIREFUTURE) {
930 KERNEL_LOCK();
931 if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
932 #ifdef pmap_wired_count
933 || (locklimit != 0 && (size +
934 ptoa(pmap_wired_count(vm_map_pmap(map)))) >
935 locklimit)
936 #endif
937 ) {
938 error = ENOMEM;
939 vm_map_unlock(map);
940 /* unmap the region! */
941 uvm_unmap(map, *addr, *addr + size);
942 KERNEL_UNLOCK();
943 return error;
944 }
945 /*
946 * uvm_map_pageable() always returns the map
947 * unlocked.
948 */
949 error = uvm_map_pageable(map, *addr, *addr + size,
950 FALSE, UVM_LK_ENTER);
951 if (error != 0) {
952 /* unmap the region! */
953 uvm_unmap(map, *addr, *addr + size);
954 KERNEL_UNLOCK();
955 return error;
956 }
957 KERNEL_UNLOCK();
958 return 0;
959 }
960 vm_map_unlock(map);
961 return 0;
962 }
963
964 /*
965 * uvm_mmapanon: internal version of mmap for anons
966 *
967 * - used by sys_mmap
968 */
969 int
970 uvm_mmapanon(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
971 vm_prot_t maxprot, int flags, vsize_t locklimit, struct proc *p)
972 {
973 int error;
974 int advice = MADV_NORMAL;
975 unsigned int uvmflag = 0;
976 vsize_t align = 0; /* userland page size */
977
978 /*
979 * for non-fixed mappings, round off the suggested address.
980 * for fixed mappings, check alignment and zap old mappings.
981 */
982 if ((flags & MAP_FIXED) == 0) {
983 *addr = round_page(*addr); /* round */
984 } else {
985 if (*addr & PAGE_MASK)
986 return EINVAL;
987
988 uvmflag |= UVM_FLAG_FIXED;
989 if ((flags & __MAP_NOREPLACE) == 0)
990 uvmflag |= UVM_FLAG_UNMAP;
991 }
992
993 if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ)
994 align = __LDPGSZ;
995 if ((flags & MAP_SHARED) == 0)
996 /* XXX: defer amap create */
997 uvmflag |= UVM_FLAG_COPYONW;
998 else
999 /* shared: create amap now */
1000 uvmflag |= UVM_FLAG_OVERLAY;
1001 if (flags & MAP_STACK)
1002 uvmflag |= UVM_FLAG_STACK;
1003 if (flags & MAP_CONCEAL)
1004 uvmflag |= UVM_FLAG_CONCEAL;
1005
1006 /* set up mapping flags */
1007 uvmflag = UVM_MAPFLAG(prot, maxprot,
1008 (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
1009 advice, uvmflag);
1010
1011 error = uvm_mapanon(map, addr, size, align, uvmflag);
1012
1013 if (error == 0)
1014 error = uvm_mmaplock(map, addr, size, prot, locklimit);
1015 return error;
1016 }
1017
1018 /*
1019 * uvm_mmapfile: internal version of mmap for non-anons
1020 *
1021 * - used by sys_mmap
1022 * - caller must page-align the file offset
1023 */
1024 int
1025 uvm_mmapfile(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
1026 vm_prot_t maxprot, int flags, struct vnode *vp, voff_t foff,
1027 vsize_t locklimit, struct proc *p)
1028 {
1029 struct uvm_object *uobj;
1030 int error;
1031 int advice = MADV_NORMAL;
1032 unsigned int uvmflag = 0;
1033 vsize_t align = 0; /* userland page size */
1034
1035 /*
1036 * for non-fixed mappings, round off the suggested address.
1037 * for fixed mappings, check alignment and zap old mappings.
1038 */
1039 if ((flags & MAP_FIXED) == 0) {
1040 *addr = round_page(*addr); /* round */
1041 } else {
1042 if (*addr & PAGE_MASK)
1043 return EINVAL;
1044
1045 uvmflag |= UVM_FLAG_FIXED;
1046 if ((flags & __MAP_NOREPLACE) == 0)
1047 uvmflag |= UVM_FLAG_UNMAP;
1048 }
1049
1050 /*
1051 * attach to underlying vm object.
1052 */
1053 if (vp->v_type != VCHR) {
1054 uobj = uvn_attach(vp, (flags & MAP_SHARED) ?
1055 maxprot : (maxprot & ~PROT_WRITE));
1056
1057 /*
1058 * XXXCDC: hack from old code
1059 * don't allow vnodes which have been mapped
1060 * shared-writeable to persist [forces them to be
1061 * flushed out when last reference goes].
1062 * XXXCDC: interesting side effect: avoids a bug.
1063 * note that in WRITE [ufs_readwrite.c] that we
1064 * allocate buffer, uncache, and then do the write.
1065 * the problem with this is that if the uncache causes
1066 * VM data to be flushed to the same area of the file
1067 * we are writing to... in that case we've got the
1068 * buffer locked and our process goes to sleep forever.
1069 *
1070 * XXXCDC: checking maxprot protects us from the
1071 * "persistbug" program but this is not a long term
1072 * solution.
1073 *
1074 * XXXCDC: we don't bother calling uncache with the vp
1075 * VOP_LOCKed since we know that we are already
1076 * holding a valid reference to the uvn (from the
1077 * uvn_attach above), and thus it is impossible for
1078 * the uncache to kill the uvn and trigger I/O.
1079 */
1080 if (flags & MAP_SHARED) {
1081 if ((prot & PROT_WRITE) ||
1082 (maxprot & PROT_WRITE)) {
1083 uvm_vnp_uncache(vp);
1084 }
1085 }
1086 } else {
1087 uobj = udv_attach(vp->v_rdev,
1088 (flags & MAP_SHARED) ? maxprot :
1089 (maxprot & ~PROT_WRITE), foff, size);
1090 /*
1091 * XXX Some devices don't like to be mapped with
1092 * XXX PROT_EXEC, but we don't really have a
1093 * XXX better way of handling this, right now
1094 */
1095 if (uobj == NULL && (prot & PROT_EXEC) == 0) {
1096 maxprot &= ~PROT_EXEC;
1097 uobj = udv_attach(vp->v_rdev,
1098 (flags & MAP_SHARED) ? maxprot :
1099 (maxprot & ~PROT_WRITE), foff, size);
1100 }
1101 advice = MADV_RANDOM;
1102 }
1103
1104 if (uobj == NULL)
1105 return vp->v_type == VREG ? ENOMEM : EINVAL;
1106
1107 if ((flags & MAP_SHARED) == 0)
1108 uvmflag |= UVM_FLAG_COPYONW;
1109 if (flags & __MAP_NOFAULT)
1110 uvmflag |= (UVM_FLAG_NOFAULT | UVM_FLAG_OVERLAY);
1111 if (flags & MAP_STACK)
1112 uvmflag |= UVM_FLAG_STACK;
1113 if (flags & MAP_CONCEAL)
1114 uvmflag |= UVM_FLAG_CONCEAL;
1115
1116 /* set up mapping flags */
1117 uvmflag = UVM_MAPFLAG(prot, maxprot,
1118 (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY,
1119 advice, uvmflag);
1120
1121 error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
1122
1123 if (error == 0)
1124 return uvm_mmaplock(map, addr, size, prot, locklimit);
1125
1126 /* errors: first detach from the uobj, if any. */
1127 if (uobj)
1128 uobj->pgops->pgo_detach(uobj);
1129
1130 return error;
1131 }
1132
1133 int
1134 sys_kbind(struct proc *p, void *v, register_t *retval)
1135 {
1136 struct sys_kbind_args /* {
1137 syscallarg(const struct __kbind *) param;
1138 syscallarg(size_t) psize;
1139 syscallarg(uint64_t) proc_cookie;
1140 } */ *uap = v;
1141 const struct __kbind *paramp;
1142 union {
1143 struct __kbind uk[KBIND_BLOCK_MAX];
1144 char upad[KBIND_BLOCK_MAX * sizeof(*paramp) + KBIND_DATA_MAX];
1145 } param;
1146 struct uvm_map_deadq dead_entries;
1147 struct process *pr = p->p_p;
1148 const char *data;
1149 vaddr_t baseva, last_baseva, endva, pageoffset, kva;
1150 size_t psize, s;
1151 u_long pc;
1152 int count, i, extra;
1153 int error, sigill = 0;
1154
1155 /*
1156 * extract syscall args from uap
1157 */
1158 paramp = SCARG(uap, param);
1159 psize = SCARG(uap, psize);
1160
1161 /*
1162 * If paramp is NULL and we're uninitialized, disable the syscall
1163 * for the process. Raise SIGILL if paramp is NULL and we're
1164 * already initialized.
1165 *
1166 * If paramp is non-NULL and we're uninitialized, do initialization.
1167 * Otherwise, do security checks and raise SIGILL on failure.
1168 */
1169 pc = PROC_PC(p);
1170 mtx_enter(&pr->ps_mtx);
1171 if (paramp == NULL) {
1172 /* ld.so disables kbind() when lazy binding is disabled */
1173 if (pr->ps_kbind_addr == 0)
1174 pr->ps_kbind_addr = BOGO_PC;
1175 /* pre-7.3 static binaries disable kbind */
1176 /* XXX delete check in 2026 */
1177 else if (pr->ps_kbind_addr != BOGO_PC)
1178 sigill = 1;
1179 } else if (pr->ps_kbind_addr == 0) {
1180 pr->ps_kbind_addr = pc;
1181 pr->ps_kbind_cookie = SCARG(uap, proc_cookie);
1182 } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC ||
1183 pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) {
1184 sigill = 1;
1185 }
1186 mtx_leave(&pr->ps_mtx);
1187
1188 /* Raise SIGILL if something is off. */
1189 if (sigill) {
1190 KERNEL_LOCK();
1191 sigexit(p, SIGILL);
1192 /* NOTREACHED */
1193 KERNEL_UNLOCK();
1194 }
1195
1196 /* We're done if we were disabling the syscall. */
1197 if (paramp == NULL)
1198 return 0;
1199
1200 if (psize < sizeof(struct __kbind) || psize > sizeof(param))
1201 return EINVAL;
1202 if ((error = copyin(paramp, &param, psize)))
1203 return error;
1204
1205 /*
1206 * The param argument points to an array of __kbind structures
1207 * followed by the corresponding new data areas for them. Verify
1208 * that the sizes in the __kbind structures add up to the total
1209 * size and find the start of the new area.
1210 */
1211 paramp = &param.uk[0];
1212 s = psize;
1213 for (count = 0; s > 0 && count < KBIND_BLOCK_MAX; count++) {
1214 if (s < sizeof(*paramp))
1215 return EINVAL;
1216 s -= sizeof(*paramp);
1217
1218 baseva = (vaddr_t)paramp[count].kb_addr;
1219 endva = baseva + paramp[count].kb_size - 1;
1220 if (paramp[count].kb_addr == NULL ||
1221 paramp[count].kb_size == 0 ||
1222 paramp[count].kb_size > KBIND_DATA_MAX ||
1223 baseva >= VM_MAXUSER_ADDRESS ||
1224 endva >= VM_MAXUSER_ADDRESS ||
1225 s < paramp[count].kb_size)
1226 return EINVAL;
1227
1228 s -= paramp[count].kb_size;
1229 }
1230 if (s > 0)
1231 return EINVAL;
1232 data = (const char *)&paramp[count];
1233
1234 /* all looks good, so do the bindings */
1235 last_baseva = VM_MAXUSER_ADDRESS;
1236 kva = 0;
1237 TAILQ_INIT(&dead_entries);
1238 KERNEL_LOCK();
1239 for (i = 0; i < count; i++) {
1240 baseva = (vaddr_t)paramp[i].kb_addr;
1241 s = paramp[i].kb_size;
1242 pageoffset = baseva & PAGE_MASK;
1243 baseva = trunc_page(baseva);
1244
1245 /* hppa at least runs PLT entries over page edge */
1246 extra = (pageoffset + s) & PAGE_MASK;
1247 if (extra > pageoffset)
1248 extra = 0;
1249 else
1250 s -= extra;
1251 redo:
1252 /* make sure the desired page is mapped into kernel_map */
1253 if (baseva != last_baseva) {
1254 if (kva != 0) {
1255 vm_map_lock(kernel_map);
1256 uvm_unmap_remove(kernel_map, kva,
1257 kva+PAGE_SIZE, &dead_entries,
1258 FALSE, TRUE, FALSE); /* XXX */
1259 vm_map_unlock(kernel_map);
1260 kva = 0;
1261 }
1262 if ((error = uvm_map_extract(&p->p_vmspace->vm_map,
1263 baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT)))
1264 break;
1265 last_baseva = baseva;
1266 }
1267
1268 /* do the update */
1269 if ((error = kcopy(data, (char *)kva + pageoffset, s)))
1270 break;
1271 data += s;
1272
1273 if (extra > 0) {
1274 baseva += PAGE_SIZE;
1275 s = extra;
1276 pageoffset = 0;
1277 extra = 0;
1278 goto redo;
1279 }
1280 }
1281
1282 if (kva != 0) {
1283 vm_map_lock(kernel_map);
1284 uvm_unmap_remove(kernel_map, kva, kva+PAGE_SIZE,
1285 &dead_entries, FALSE, TRUE, FALSE); /* XXX */
1286 vm_map_unlock(kernel_map);
1287 }
1288 uvm_unmap_detach(&dead_entries, AMAP_REFALL);
1289 KERNEL_UNLOCK();
1290
1291 return error;
1292 }
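
/*
 * Wire-format sketch (illustrative) of the param buffer parsed above:
 * a packed sequence of up to KBIND_BLOCK_MAX __kbind headers followed
 * by the concatenated data blocks, e.g. for two bindings:
 *
 *	struct __kbind uk[2];	(kb_addr/kb_size for each target)
 *	char data[];		(uk[0].kb_size bytes for the first
 *				 target, then uk[1].kb_size bytes)
 *
 * psize must equal the headers plus the total data size exactly, or
 * the call fails with EINVAL.
 */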