FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1988 University of Utah.
    5  * Copyright (c) 1991, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * the Systems Programming Group of the University of Utah Computer
   10  * Science Department.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   37  *
   38  *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
   39  */
   40 
   41 /*
   42  * Mapped file (mmap) interface to VM
   43  */
   44 
   45 #include <sys/cdefs.h>
   46 __FBSDID("$FreeBSD$");
   47 
   48 #include "opt_hwpmc_hooks.h"
   49 #include "opt_vm.h"
   50 
   51 #include <sys/param.h>
   52 #include <sys/systm.h>
   53 #include <sys/capsicum.h>
   54 #include <sys/kernel.h>
   55 #include <sys/lock.h>
   56 #include <sys/mutex.h>
   57 #include <sys/sysproto.h>
   58 #include <sys/elf.h>
   59 #include <sys/filedesc.h>
   60 #include <sys/priv.h>
   61 #include <sys/proc.h>
   62 #include <sys/procctl.h>
   63 #include <sys/racct.h>
   64 #include <sys/resource.h>
   65 #include <sys/resourcevar.h>
   66 #include <sys/rwlock.h>
   67 #include <sys/sysctl.h>
   68 #include <sys/vnode.h>
   69 #include <sys/fcntl.h>
   70 #include <sys/file.h>
   71 #include <sys/mman.h>
   72 #include <sys/mount.h>
   73 #include <sys/conf.h>
   74 #include <sys/stat.h>
   75 #include <sys/syscallsubr.h>
   76 #include <sys/sysent.h>
   77 #include <sys/vmmeter.h>
   78 #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
   79 #include <machine/md_var.h>
   80 #endif
   81 
   82 #include <security/audit/audit.h>
   83 #include <security/mac/mac_framework.h>
   84 
   85 #include <vm/vm.h>
   86 #include <vm/vm_param.h>
   87 #include <vm/pmap.h>
   88 #include <vm/vm_map.h>
   89 #include <vm/vm_object.h>
   90 #include <vm/vm_page.h>
   91 #include <vm/vm_pager.h>
   92 #include <vm/vm_pageout.h>
   93 #include <vm/vm_extern.h>
   94 #include <vm/vm_page.h>
   95 #include <vm/vnode_pager.h>
   96 
   97 #ifdef HWPMC_HOOKS
   98 #include <sys/pmckern.h>
   99 #endif
  100 
  101 int old_mlock = 0;
  102 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
  103     "Do not apply RLIMIT_MEMLOCK on mlockall");
  104 static int mincore_mapped = 1;
  105 SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
  106     "mincore reports mappings, not residency");
  107 static int imply_prot_max = 0;
  108 SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
  109     "Imply maximum page protections in mmap() when none are specified");
  110 
  111 #ifdef MAP_32BIT
  112 #define MAP_32BIT_MAX_ADDR      ((vm_offset_t)1 << 31)
  113 #endif
  114 
  115 _Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");
  116 
  117 #ifndef _SYS_SYSPROTO_H_
  118 struct sbrk_args {
  119         int incr;
  120 };
  121 #endif
  122 
  123 int
  124 sys_sbrk(struct thread *td, struct sbrk_args *uap)
  125 {
  126         /* Not yet implemented */
  127         return (EOPNOTSUPP);
  128 }
  129 
  130 #ifndef _SYS_SYSPROTO_H_
  131 struct sstk_args {
  132         int incr;
  133 };
  134 #endif
  135 
  136 int
  137 sys_sstk(struct thread *td, struct sstk_args *uap)
  138 {
  139         /* Not yet implemented */
  140         return (EOPNOTSUPP);
  141 }
  142 
  143 #if defined(COMPAT_43)
  144 int
  145 ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
  146 {
  147 
  148         td->td_retval[0] = PAGE_SIZE;
  149         return (0);
  150 }
  151 #endif                          /* COMPAT_43 */
  152 
  153 /*
  154  * Memory Map (mmap) system call.  Note that the file offset
  155  * and address are allowed to be NOT page aligned, though if
   156  * the MAP_FIXED flag is set, both must have the same remainder
  157  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  158  * page-aligned, the actual mapping starts at trunc_page(addr)
  159  * and the return value is adjusted up by the page offset.
  160  *
  161  * Generally speaking, only character devices which are themselves
  162  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  163  * there would be no cache coherency between a descriptor and a VM mapping
  164  * both to the same character device.
  165  */
  166 #ifndef _SYS_SYSPROTO_H_
  167 struct mmap_args {
  168         void *addr;
  169         size_t len;
  170         int prot;
  171         int flags;
  172         int fd;
  173         long pad;
  174         off_t pos;
  175 };
  176 #endif
  177 
  178 int
  179 sys_mmap(struct thread *td, struct mmap_args *uap)
  180 {
  181 
  182         return (kern_mmap(td, &(struct mmap_req){
  183                 .mr_hint = (uintptr_t)uap->addr,
  184                 .mr_len = uap->len,
  185                 .mr_prot = uap->prot,
  186                 .mr_flags = uap->flags,
  187                 .mr_fd = uap->fd,
  188                 .mr_pos = uap->pos,
  189             }));
  190 }
  191 
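
Illustrative example (not part of vm_mmap.c): a minimal userland sketch of the behaviour described in the comment above, showing that an anonymous request is rounded up to a whole page and the returned address is page aligned. The 100-byte length is an arbitrary assumption.

#include <sys/mman.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        size_t len = 100;       /* rounded up to one page by the kernel */
        void *p;

        /* MAP_ANON requires fd == -1 and offset 0, as kern_mmap() enforces. */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");
        printf("mapped at %p, page size %ld\n", p, sysconf(_SC_PAGESIZE));
        munmap(p, len);
        return (0);
}
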
  192 int
  193 kern_mmap_maxprot(struct proc *p, int prot)
  194 {
  195 
  196         if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
  197             (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
  198                 return (_PROT_ALL);
  199         if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
  200             prot != PROT_NONE)
  201                  return (prot);
  202         return (_PROT_ALL);
  203 }
  204 
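
kern_mmap_maxprot() above decides the implied maximum protection when the caller does not pass a PROT_MAX() component itself. A hedged userland sketch (assuming FreeBSD 13 or newer headers where PROT_MAX() is defined, and 4 KB pages) of capping a mapping at read/write:

#include <sys/mman.h>
#include <err.h>

int
main(void)
{
        void *p;

        /* Request a read-only page whose protection may never exceed RW. */
        p = mmap(NULL, 4096, PROT_MAX(PROT_READ | PROT_WRITE) | PROT_READ,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        /* Raising to read/write stays within PROT_MAX and succeeds... */
        if (mprotect(p, 4096, PROT_READ | PROT_WRITE) == -1)
                err(1, "mprotect RW");
        /* ...but PROT_EXEC exceeds the cap and is rejected (EACCES). */
        if (mprotect(p, 4096, PROT_READ | PROT_EXEC) == 0)
                errx(1, "unexpected: exec protection was granted");
        return (0);
}
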
  205 int
  206 kern_mmap(struct thread *td, const struct mmap_req *mrp)
  207 {
  208         struct vmspace *vms;
  209         struct file *fp;
  210         struct proc *p;
  211         off_t pos;
  212         vm_offset_t addr, orig_addr;
  213         vm_size_t len, pageoff, size;
  214         vm_prot_t cap_maxprot;
  215         int align, error, fd, flags, max_prot, prot;
  216         cap_rights_t rights;
  217         mmap_check_fp_fn check_fp_fn;
  218 
  219         orig_addr = addr = mrp->mr_hint;
  220         len = mrp->mr_len;
  221         prot = mrp->mr_prot;
  222         flags = mrp->mr_flags;
  223         fd = mrp->mr_fd;
  224         pos = mrp->mr_pos;
  225         check_fp_fn = mrp->mr_check_fp_fn;
  226 
  227         if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
  228                 return (EINVAL);
  229         max_prot = PROT_MAX_EXTRACT(prot);
  230         prot = PROT_EXTRACT(prot);
  231         if (max_prot != 0 && (max_prot & prot) != prot)
  232                 return (ENOTSUP);
  233 
  234         p = td->td_proc;
  235 
  236         /*
  237          * Always honor PROT_MAX if set.  If not, default to all
  238          * permissions unless we're implying maximum permissions.
  239          */
  240         if (max_prot == 0)
  241                 max_prot = kern_mmap_maxprot(p, prot);
  242 
  243         vms = p->p_vmspace;
  244         fp = NULL;
  245         AUDIT_ARG_FD(fd);
  246 
  247         /*
  248          * Ignore old flags that used to be defined but did not do anything.
  249          */
  250         flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
  251 
  252         /*
  253          * Enforce the constraints.
  254          * Mapping of length 0 is only allowed for old binaries.
   255  * Anonymous mapping shall specify -1 as file descriptor and
  256          * zero position for new code. Be nice to ancient a.out
  257          * binaries and correct pos for anonymous mapping, since old
  258          * ld.so sometimes issues anonymous map requests with non-zero
  259          * pos.
  260          */
  261         if (!SV_CURPROC_FLAG(SV_AOUT)) {
  262                 if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
  263                     ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
  264                         return (EINVAL);
  265         } else {
  266                 if ((flags & MAP_ANON) != 0)
  267                         pos = 0;
  268         }
  269 
  270         if (flags & MAP_STACK) {
  271                 if ((fd != -1) ||
  272                     ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
  273                         return (EINVAL);
  274                 flags |= MAP_ANON;
  275                 pos = 0;
  276         }
  277         if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
  278             MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
  279             MAP_PREFAULT_READ | MAP_GUARD |
  280 #ifdef MAP_32BIT
  281             MAP_32BIT |
  282 #endif
  283             MAP_ALIGNMENT_MASK)) != 0)
  284                 return (EINVAL);
  285         if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
  286                 return (EINVAL);
  287         if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
  288                 return (EINVAL);
  289         if (prot != PROT_NONE &&
  290             (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
  291                 return (EINVAL);
  292         if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
  293             pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
  294 #ifdef MAP_32BIT
  295             MAP_32BIT |
  296 #endif
  297             MAP_ALIGNMENT_MASK)) != 0))
  298                 return (EINVAL);
  299 
  300         /*
  301          * Align the file position to a page boundary,
  302          * and save its page offset component.
  303          */
  304         pageoff = (pos & PAGE_MASK);
  305         pos -= pageoff;
  306 
  307         /* Compute size from len by rounding (on both ends). */
  308         size = len + pageoff;                   /* low end... */
  309         size = round_page(size);                /* hi end */
  310         /* Check for rounding up to zero. */
  311         if (len > size)
  312                 return (ENOMEM);
  313 
  314         /* Ensure alignment is at least a page and fits in a pointer. */
  315         align = flags & MAP_ALIGNMENT_MASK;
  316         if (align != 0 && align != MAP_ALIGNED_SUPER &&
  317             (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
  318             align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
  319                 return (EINVAL);
  320 
  321         /*
  322          * Check for illegal addresses.  Watch out for address wrap... Note
  323          * that VM_*_ADDRESS are not constants due to casts (argh).
  324          */
  325         if (flags & MAP_FIXED) {
  326                 /*
  327                  * The specified address must have the same remainder
  328                  * as the file offset taken modulo PAGE_SIZE, so it
  329                  * should be aligned after adjustment by pageoff.
  330                  */
  331                 addr -= pageoff;
  332                 if (addr & PAGE_MASK)
  333                         return (EINVAL);
  334 
  335                 /* Address range must be all in user VM space. */
  336                 if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
  337                         return (EINVAL);
  338 #ifdef MAP_32BIT
  339                 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
  340                         return (EINVAL);
  341         } else if (flags & MAP_32BIT) {
  342                 /*
  343                  * For MAP_32BIT, override the hint if it is too high and
  344                  * do not bother moving the mapping past the heap (since
  345                  * the heap is usually above 2GB).
  346                  */
  347                 if (addr + size > MAP_32BIT_MAX_ADDR)
  348                         addr = 0;
  349 #endif
  350         } else {
  351                 /*
  352                  * XXX for non-fixed mappings where no hint is provided or
  353                  * the hint would fall in the potential heap space,
  354                  * place it after the end of the largest possible heap.
  355                  *
  356                  * There should really be a pmap call to determine a reasonable
  357                  * location.
  358                  */
  359                 if (addr == 0 ||
  360                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
  361                     addr < round_page((vm_offset_t)vms->vm_daddr +
  362                     lim_max(td, RLIMIT_DATA))))
  363                         addr = round_page((vm_offset_t)vms->vm_daddr +
  364                             lim_max(td, RLIMIT_DATA));
  365         }
  366         if (len == 0) {
  367                 /*
  368                  * Return success without mapping anything for old
  369                  * binaries that request a page-aligned mapping of
  370                  * length 0.  For modern binaries, this function
  371                  * returns an error earlier.
  372                  */
  373                 error = 0;
  374         } else if ((flags & MAP_GUARD) != 0) {
  375                 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
  376                     VM_PROT_NONE, flags, NULL, pos, FALSE, td);
  377         } else if ((flags & MAP_ANON) != 0) {
  378                 /*
  379                  * Mapping blank space is trivial.
  380                  *
  381                  * This relies on VM_PROT_* matching PROT_*.
  382                  */
  383                 error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
  384                     max_prot, flags, NULL, pos, FALSE, td);
  385         } else {
  386                 /*
  387                  * Mapping file, get fp for validation and don't let the
  388                  * descriptor disappear on us if we block. Check capability
  389                  * rights, but also return the maximum rights to be combined
  390                  * with maxprot later.
  391                  */
  392                 cap_rights_init_one(&rights, CAP_MMAP);
  393                 if (prot & PROT_READ)
  394                         cap_rights_set_one(&rights, CAP_MMAP_R);
  395                 if ((flags & MAP_SHARED) != 0) {
  396                         if (prot & PROT_WRITE)
  397                                 cap_rights_set_one(&rights, CAP_MMAP_W);
  398                 }
  399                 if (prot & PROT_EXEC)
  400                         cap_rights_set_one(&rights, CAP_MMAP_X);
  401                 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
  402                 if (error != 0)
  403                         goto done;
  404                 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
  405                     p->p_osrel >= P_OSREL_MAP_FSTRICT) {
  406                         error = EINVAL;
  407                         goto done;
  408                 }
  409                 if (check_fp_fn != NULL) {
  410                         error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
  411                             flags);
  412                         if (error != 0)
  413                                 goto done;
  414                 }
  415                 if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
  416                         addr = orig_addr;
  417                 /* This relies on VM_PROT_* matching PROT_*. */
  418                 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
  419                     max_prot & cap_maxprot, flags, pos, td);
  420         }
  421 
  422         if (error == 0)
  423                 td->td_retval[0] = addr + pageoff;
  424 done:
  425         if (fp)
  426                 fdrop(fp, td);
  427 
  428         return (error);
  429 }
  430 
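
kern_mmap() above only admits MAP_GUARD together with PROT_NONE, fd == -1 and a zero offset. As a sketch (not part of the file), reserving a 1 MB guard region whose address range stays unusable until it is explicitly replaced or unmapped:

#include <sys/mman.h>
#include <err.h>

int
main(void)
{
        void *guard;

        /*
         * A guard reserves address space: any access faults, and other
         * mappings are not placed there unless MAP_FIXED is used.
         */
        guard = mmap(NULL, 1UL << 20, PROT_NONE, MAP_GUARD, -1, 0);
        if (guard == MAP_FAILED)
                err(1, "mmap(MAP_GUARD)");
        /* Touching *guard at this point would deliver SIGSEGV. */
        if (munmap(guard, 1UL << 20) == -1)
                err(1, "munmap");
        return (0);
}
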
  431 #if defined(COMPAT_FREEBSD6)
  432 int
  433 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
  434 {
  435         return (kern_mmap(td, &(struct mmap_req){
  436                 .mr_hint = (uintptr_t)uap->addr,
  437                 .mr_len = uap->len,
  438                 .mr_prot = uap->prot,
  439                 .mr_flags = uap->flags,
  440                 .mr_fd = uap->fd,
  441                 .mr_pos = uap->pos,
  442             }));
  443 }
  444 #endif
  445 
  446 #ifdef COMPAT_43
  447 #ifndef _SYS_SYSPROTO_H_
  448 struct ommap_args {
  449         caddr_t addr;
  450         int len;
  451         int prot;
  452         int flags;
  453         int fd;
  454         long pos;
  455 };
  456 #endif
  457 int
  458 ommap(struct thread *td, struct ommap_args *uap)
  459 {
  460         return (kern_ommap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
  461             uap->flags, uap->fd, uap->pos));
  462 }
  463 
  464 int
  465 kern_ommap(struct thread *td, uintptr_t hint, int len, int oprot,
  466     int oflags, int fd, long pos)
  467 {
  468         static const char cvtbsdprot[8] = {
  469                 0,
  470                 PROT_EXEC,
  471                 PROT_WRITE,
  472                 PROT_EXEC | PROT_WRITE,
  473                 PROT_READ,
  474                 PROT_EXEC | PROT_READ,
  475                 PROT_WRITE | PROT_READ,
  476                 PROT_EXEC | PROT_WRITE | PROT_READ,
  477         };
  478         int flags, prot;
  479 
  480         if (len < 0)
  481                 return (EINVAL);
  482 
  483 #define OMAP_ANON       0x0002
  484 #define OMAP_COPY       0x0020
  485 #define OMAP_SHARED     0x0010
  486 #define OMAP_FIXED      0x0100
  487 
  488         prot = cvtbsdprot[oprot & 0x7];
  489 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
  490         if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
  491             prot != 0)
  492                 prot |= PROT_EXEC;
  493 #endif
  494         flags = 0;
  495         if (oflags & OMAP_ANON)
  496                 flags |= MAP_ANON;
  497         if (oflags & OMAP_COPY)
  498                 flags |= MAP_COPY;
  499         if (oflags & OMAP_SHARED)
  500                 flags |= MAP_SHARED;
  501         else
  502                 flags |= MAP_PRIVATE;
  503         if (oflags & OMAP_FIXED)
  504                 flags |= MAP_FIXED;
  505         return (kern_mmap(td, &(struct mmap_req){
  506                 .mr_hint = hint,
  507                 .mr_len = len,
  508                 .mr_prot = prot,
  509                 .mr_flags = flags,
  510                 .mr_fd = fd,
  511                 .mr_pos = pos,
  512             }));
  513 }
  514 #endif                          /* COMPAT_43 */
  515 
  516 #ifndef _SYS_SYSPROTO_H_
  517 struct msync_args {
  518         void *addr;
  519         size_t len;
  520         int flags;
  521 };
  522 #endif
  523 int
  524 sys_msync(struct thread *td, struct msync_args *uap)
  525 {
  526 
  527         return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
  528 }
  529 
  530 int
  531 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
  532 {
  533         vm_offset_t addr;
  534         vm_size_t pageoff;
  535         vm_map_t map;
  536         int rv;
  537 
  538         addr = addr0;
  539         pageoff = (addr & PAGE_MASK);
  540         addr -= pageoff;
  541         size += pageoff;
  542         size = (vm_size_t) round_page(size);
  543         if (addr + size < addr)
  544                 return (EINVAL);
  545 
  546         if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
  547                 return (EINVAL);
  548 
  549         map = &td->td_proc->p_vmspace->vm_map;
  550 
  551         /*
  552          * Clean the pages and interpret the return value.
  553          */
  554         rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
  555             (flags & MS_INVALIDATE) != 0);
  556         switch (rv) {
  557         case KERN_SUCCESS:
  558                 return (0);
  559         case KERN_INVALID_ADDRESS:
  560                 return (ENOMEM);
  561         case KERN_INVALID_ARGUMENT:
  562                 return (EBUSY);
  563         case KERN_FAILURE:
  564                 return (EIO);
  565         default:
  566                 return (EINVAL);
  567         }
  568 }
  569 
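
kern_msync() rejects MS_ASYNC combined with MS_INVALIDATE and translates the vm_map_sync() result into an errno value. A minimal userland sketch, assuming a scratch file named msync_demo.bin may be created in the current directory and 4 KB pages:

#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        char *p;
        int fd;

        fd = open("msync_demo.bin", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd == -1 || ftruncate(fd, 4096) == -1)
                err(1, "open/ftruncate");
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");
        memset(p, 0xa5, 4096);
        /* Synchronously flush the dirty page back to the file. */
        if (msync(p, 4096, MS_SYNC) == -1)
                err(1, "msync");
        /* Passing MS_ASYNC | MS_INVALIDATE together would fail with EINVAL. */
        munmap(p, 4096);
        close(fd);
        return (0);
}
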
  570 #ifndef _SYS_SYSPROTO_H_
  571 struct munmap_args {
  572         void *addr;
  573         size_t len;
  574 };
  575 #endif
  576 int
  577 sys_munmap(struct thread *td, struct munmap_args *uap)
  578 {
  579 
  580         return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
  581 }
  582 
  583 int
  584 kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
  585 {
  586 #ifdef HWPMC_HOOKS
  587         struct pmckern_map_out pkm;
  588         vm_map_entry_t entry;
  589         bool pmc_handled;
  590 #endif
  591         vm_offset_t addr, end;
  592         vm_size_t pageoff;
  593         vm_map_t map;
  594         int rv;
  595 
  596         if (size == 0)
  597                 return (EINVAL);
  598 
  599         addr = addr0;
  600         pageoff = (addr & PAGE_MASK);
  601         addr -= pageoff;
  602         size += pageoff;
  603         size = (vm_size_t) round_page(size);
  604         end = addr + size;
  605         map = &td->td_proc->p_vmspace->vm_map;
  606         if (!vm_map_range_valid(map, addr, end))
  607                 return (EINVAL);
  608 
  609         vm_map_lock(map);
  610 #ifdef HWPMC_HOOKS
  611         pmc_handled = false;
  612         if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
  613                 pmc_handled = true;
  614                 /*
  615                  * Inform hwpmc if the address range being unmapped contains
  616                  * an executable region.
  617                  */
  618                 pkm.pm_address = (uintptr_t) NULL;
  619                 if (vm_map_lookup_entry(map, addr, &entry)) {
  620                         for (; entry->start < end;
  621                             entry = vm_map_entry_succ(entry)) {
  622                                 if (vm_map_check_protection(map, entry->start,
  623                                         entry->end, VM_PROT_EXECUTE) == TRUE) {
  624                                         pkm.pm_address = (uintptr_t) addr;
  625                                         pkm.pm_size = (size_t) size;
  626                                         break;
  627                                 }
  628                         }
  629                 }
  630         }
  631 #endif
  632         rv = vm_map_delete(map, addr, end);
  633 
  634 #ifdef HWPMC_HOOKS
  635         if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
  636                 /* downgrade the lock to prevent a LOR with the pmc-sx lock */
  637                 vm_map_lock_downgrade(map);
  638                 if (pkm.pm_address != (uintptr_t) NULL)
  639                         PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
  640                 vm_map_unlock_read(map);
  641         } else
  642 #endif
  643                 vm_map_unlock(map);
  644 
  645         return (vm_mmap_to_errno(rv));
  646 }
  647 
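
kern_munmap() above rejects a zero length and rounds the range to whole pages before deleting the map entries. A small sketch of both behaviours (illustrative only, assuming 4 KB pages):

#include <sys/mman.h>
#include <err.h>
#include <errno.h>

int
main(void)
{
        char *p;

        p = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        /* A zero-length unmap is rejected outright. */
        if (munmap(p, 0) == 0 || errno != EINVAL)
                errx(1, "zero-length munmap unexpectedly accepted");

        /* A partial-page length is rounded up: this removes the first page. */
        if (munmap(p, 1) == -1)
                err(1, "munmap");
        munmap(p + 4096, 4096);
        return (0);
}
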
  648 #ifndef _SYS_SYSPROTO_H_
  649 struct mprotect_args {
  650         const void *addr;
  651         size_t len;
  652         int prot;
  653 };
  654 #endif
  655 int
  656 sys_mprotect(struct thread *td, struct mprotect_args *uap)
  657 {
  658 
  659         return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
  660 }
  661 
  662 int
  663 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
  664 {
  665         vm_offset_t addr;
  666         vm_size_t pageoff;
  667         int vm_error, max_prot;
  668         int flags;
  669 
  670         addr = addr0;
  671         if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
  672                 return (EINVAL);
  673         max_prot = PROT_MAX_EXTRACT(prot);
  674         prot = PROT_EXTRACT(prot);
  675         pageoff = (addr & PAGE_MASK);
  676         addr -= pageoff;
  677         size += pageoff;
  678         size = (vm_size_t) round_page(size);
  679 #ifdef COMPAT_FREEBSD32
  680         if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
  681                 if (((addr + size) & 0xffffffff) < addr)
  682                         return (EINVAL);
  683         } else
  684 #endif
  685         if (addr + size < addr)
  686                 return (EINVAL);
  687 
  688         flags = VM_MAP_PROTECT_SET_PROT;
  689         if (max_prot != 0)
  690                 flags |= VM_MAP_PROTECT_SET_MAXPROT;
  691         vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
  692             addr, addr + size, prot, max_prot, flags);
  693 
  694         switch (vm_error) {
  695         case KERN_SUCCESS:
  696                 return (0);
  697         case KERN_PROTECTION_FAILURE:
  698                 return (EACCES);
  699         case KERN_RESOURCE_SHORTAGE:
  700                 return (ENOMEM);
  701         case KERN_OUT_OF_BOUNDS:
  702                 return (ENOTSUP);
  703         }
  704         return (EINVAL);
  705 }
  706 
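
kern_mprotect() above extracts an optional PROT_MAX() component and maps the vm_map_protect() return values onto errno (for example, KERN_PROTECTION_FAILURE becomes EACCES). A minimal sketch of dropping write permission on an anonymous page (not part of the file, 4 KB pages assumed):

#include <sys/mman.h>
#include <err.h>

int
main(void)
{
        char *p;

        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");
        p[0] = 1;                       /* writable for now */
        if (mprotect(p, 4096, PROT_READ) == -1)
                err(1, "mprotect");
        /* A store to p[0] at this point would fault with SIGSEGV. */
        munmap(p, 4096);
        return (0);
}
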
  707 #ifndef _SYS_SYSPROTO_H_
  708 struct minherit_args {
  709         void *addr;
  710         size_t len;
  711         int inherit;
  712 };
  713 #endif
  714 int
  715 sys_minherit(struct thread *td, struct minherit_args *uap)
  716 {
  717 
  718         return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
  719             uap->inherit));
  720 }
  721 
  722 int
  723 kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
  724 {
  725         vm_offset_t addr;
  726         vm_size_t size, pageoff;
  727         vm_inherit_t inherit;
  728 
  729         addr = (vm_offset_t)addr0;
  730         size = len;
  731         inherit = inherit0;
  732 
  733         pageoff = (addr & PAGE_MASK);
  734         addr -= pageoff;
  735         size += pageoff;
  736         size = (vm_size_t) round_page(size);
  737         if (addr + size < addr)
  738                 return (EINVAL);
  739 
  740         switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
  741             addr + size, inherit)) {
  742         case KERN_SUCCESS:
  743                 return (0);
  744         case KERN_PROTECTION_FAILURE:
  745                 return (EACCES);
  746         }
  747         return (EINVAL);
  748 }
  749 
  750 #ifndef _SYS_SYSPROTO_H_
  751 struct madvise_args {
  752         void *addr;
  753         size_t len;
  754         int behav;
  755 };
  756 #endif
  757 
  758 int
  759 sys_madvise(struct thread *td, struct madvise_args *uap)
  760 {
  761 
  762         return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
  763 }
  764 
  765 int
  766 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
  767 {
  768         vm_map_t map;
  769         vm_offset_t addr, end, start;
  770         int flags;
  771 
  772         /*
  773          * Check for our special case, advising the swap pager we are
  774          * "immortal."
  775          */
  776         if (behav == MADV_PROTECT) {
  777                 flags = PPROT_SET;
  778                 return (kern_procctl(td, P_PID, td->td_proc->p_pid,
  779                     PROC_SPROTECT, &flags));
  780         }
  781 
  782         /*
  783          * Check for illegal addresses.  Watch out for address wrap... Note
  784          * that VM_*_ADDRESS are not constants due to casts (argh).
  785          */
  786         map = &td->td_proc->p_vmspace->vm_map;
  787         addr = addr0;
  788         if (!vm_map_range_valid(map, addr, addr + len))
  789                 return (EINVAL);
  790 
  791         /*
  792          * Since this routine is only advisory, we default to conservative
  793          * behavior.
  794          */
  795         start = trunc_page(addr);
  796         end = round_page(addr + len);
  797 
  798         /*
  799          * vm_map_madvise() checks for illegal values of behav.
  800          */
  801         return (vm_map_madvise(map, start, end, behav));
  802 }
  803 
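
kern_madvise() above passes ordinary requests to vm_map_madvise(); MADV_PROTECT is special-cased into a kern_procctl(PROC_SPROTECT) call. A userland sketch of an ordinary advisory request (illustrative only; the 16-page size is an arbitrary assumption):

#include <sys/mman.h>
#include <err.h>

int
main(void)
{
        size_t len = 16 * 4096;
        void *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");
        /* Advise the VM system that these pages may be reclaimed cheaply. */
        if (madvise(p, len, MADV_FREE) == -1)
                err(1, "madvise");
        munmap(p, len);
        return (0);
}
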
  804 #ifndef _SYS_SYSPROTO_H_
  805 struct mincore_args {
  806         const void *addr;
  807         size_t len;
  808         char *vec;
  809 };
  810 #endif
  811 
  812 int
  813 sys_mincore(struct thread *td, struct mincore_args *uap)
  814 {
  815 
  816         return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
  817 }
  818 
  819 int
  820 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
  821 {
  822         pmap_t pmap;
  823         vm_map_t map;
  824         vm_map_entry_t current, entry;
  825         vm_object_t object;
  826         vm_offset_t addr, cend, end, first_addr;
  827         vm_paddr_t pa;
  828         vm_page_t m;
  829         vm_pindex_t pindex;
  830         int error, lastvecindex, mincoreinfo, vecindex;
  831         unsigned int timestamp;
  832 
  833         /*
  834          * Make sure that the addresses presented are valid for user
  835          * mode.
  836          */
  837         first_addr = addr = trunc_page(addr0);
  838         end = round_page(addr0 + len);
  839         map = &td->td_proc->p_vmspace->vm_map;
  840         if (end > vm_map_max(map) || end < addr)
  841                 return (ENOMEM);
  842 
  843         pmap = vmspace_pmap(td->td_proc->p_vmspace);
  844 
  845         vm_map_lock_read(map);
  846 RestartScan:
  847         timestamp = map->timestamp;
  848 
  849         if (!vm_map_lookup_entry(map, addr, &entry)) {
  850                 vm_map_unlock_read(map);
  851                 return (ENOMEM);
  852         }
  853 
  854         /*
  855          * Do this on a map entry basis so that if the pages are not
   856  * in the current process's address space, we can easily look
  857          * up the pages elsewhere.
  858          */
  859         lastvecindex = -1;
  860         while (entry->start < end) {
  861                 /*
  862                  * check for contiguity
  863                  */
  864                 current = entry;
  865                 entry = vm_map_entry_succ(current);
  866                 if (current->end < end &&
  867                     entry->start > current->end) {
  868                         vm_map_unlock_read(map);
  869                         return (ENOMEM);
  870                 }
  871 
  872                 /*
  873                  * ignore submaps (for now) or null objects
  874                  */
  875                 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
  876                     current->object.vm_object == NULL)
  877                         continue;
  878 
  879                 /*
  880                  * limit this scan to the current map entry and the
  881                  * limits for the mincore call
  882                  */
  883                 if (addr < current->start)
  884                         addr = current->start;
  885                 cend = current->end;
  886                 if (cend > end)
  887                         cend = end;
  888 
  889                 for (; addr < cend; addr += PAGE_SIZE) {
  890                         /*
  891                          * Check pmap first, it is likely faster, also
  892                          * it can provide info as to whether we are the
  893                          * one referencing or modifying the page.
  894                          */
  895                         m = NULL;
  896                         object = NULL;
  897 retry:
  898                         pa = 0;
  899                         mincoreinfo = pmap_mincore(pmap, addr, &pa);
  900                         if (mincore_mapped) {
  901                                 /*
  902                                  * We only care about this pmap's
  903                                  * mapping of the page, if any.
  904                                  */
  905                                 ;
  906                         } else if (pa != 0) {
  907                                 /*
  908                                  * The page is mapped by this process but not
  909                                  * both accessed and modified.  It is also
  910                                  * managed.  Acquire the object lock so that
  911                                  * other mappings might be examined.  The page's
  912                                  * identity may change at any point before its
  913                                  * object lock is acquired, so re-validate if
  914                                  * necessary.
  915                                  */
  916                                 m = PHYS_TO_VM_PAGE(pa);
  917                                 while (object == NULL || m->object != object) {
  918                                         if (object != NULL)
  919                                                 VM_OBJECT_WUNLOCK(object);
  920                                         object = atomic_load_ptr(&m->object);
  921                                         if (object == NULL)
  922                                                 goto retry;
  923                                         VM_OBJECT_WLOCK(object);
  924                                 }
  925                                 if (pa != pmap_extract(pmap, addr))
  926                                         goto retry;
  927                                 KASSERT(vm_page_all_valid(m),
  928                                     ("mincore: page %p is mapped but invalid",
  929                                     m));
  930                         } else if (mincoreinfo == 0) {
  931                                 /*
  932                                  * The page is not mapped by this process.  If
  933                                  * the object implements managed pages, then
  934                                  * determine if the page is resident so that
  935                                  * the mappings might be examined.
  936                                  */
  937                                 if (current->object.vm_object != object) {
  938                                         if (object != NULL)
  939                                                 VM_OBJECT_WUNLOCK(object);
  940                                         object = current->object.vm_object;
  941                                         VM_OBJECT_WLOCK(object);
  942                                 }
  943                                 if ((object->flags & OBJ_SWAP) != 0 ||
  944                                     object->type == OBJT_VNODE) {
  945                                         pindex = OFF_TO_IDX(current->offset +
  946                                             (addr - current->start));
  947                                         m = vm_page_lookup(object, pindex);
  948                                         if (m != NULL && vm_page_none_valid(m))
  949                                                 m = NULL;
  950                                         if (m != NULL)
  951                                                 mincoreinfo = MINCORE_INCORE;
  952                                 }
  953                         }
  954                         if (m != NULL) {
  955                                 VM_OBJECT_ASSERT_WLOCKED(m->object);
  956 
  957                                 /* Examine other mappings of the page. */
  958                                 if (m->dirty == 0 && pmap_is_modified(m))
  959                                         vm_page_dirty(m);
  960                                 if (m->dirty != 0)
  961                                         mincoreinfo |= MINCORE_MODIFIED_OTHER;
  962 
  963                                 /*
  964                                  * The first test for PGA_REFERENCED is an
  965                                  * optimization.  The second test is
  966                                  * required because a concurrent pmap
  967                                  * operation could clear the last reference
  968                                  * and set PGA_REFERENCED before the call to
  969                                  * pmap_is_referenced(). 
  970                                  */
  971                                 if ((m->a.flags & PGA_REFERENCED) != 0 ||
  972                                     pmap_is_referenced(m) ||
  973                                     (m->a.flags & PGA_REFERENCED) != 0)
  974                                         mincoreinfo |= MINCORE_REFERENCED_OTHER;
  975                         }
  976                         if (object != NULL)
  977                                 VM_OBJECT_WUNLOCK(object);
  978 
  979                         /*
  980                          * subyte may page fault.  In case it needs to modify
  981                          * the map, we release the lock.
  982                          */
  983                         vm_map_unlock_read(map);
  984 
  985                         /*
  986                          * calculate index into user supplied byte vector
  987                          */
  988                         vecindex = atop(addr - first_addr);
  989 
  990                         /*
  991                          * If we have skipped map entries, we need to make sure that
  992                          * the byte vector is zeroed for those skipped entries.
  993                          */
  994                         while ((lastvecindex + 1) < vecindex) {
  995                                 ++lastvecindex;
  996                                 error = subyte(vec + lastvecindex, 0);
  997                                 if (error) {
  998                                         error = EFAULT;
  999                                         goto done2;
 1000                                 }
 1001                         }
 1002 
 1003                         /*
 1004                          * Pass the page information to the user
 1005                          */
 1006                         error = subyte(vec + vecindex, mincoreinfo);
 1007                         if (error) {
 1008                                 error = EFAULT;
 1009                                 goto done2;
 1010                         }
 1011 
 1012                         /*
 1013                          * If the map has changed, due to the subyte, the previous
 1014                          * output may be invalid.
 1015                          */
 1016                         vm_map_lock_read(map);
 1017                         if (timestamp != map->timestamp)
 1018                                 goto RestartScan;
 1019 
 1020                         lastvecindex = vecindex;
 1021                 }
 1022         }
 1023 
 1024         /*
 1025          * subyte may page fault.  In case it needs to modify
 1026          * the map, we release the lock.
 1027          */
 1028         vm_map_unlock_read(map);
 1029 
 1030         /*
 1031          * Zero the last entries in the byte vector.
 1032          */
 1033         vecindex = atop(end - first_addr);
 1034         while ((lastvecindex + 1) < vecindex) {
 1035                 ++lastvecindex;
 1036                 error = subyte(vec + lastvecindex, 0);
 1037                 if (error) {
 1038                         error = EFAULT;
 1039                         goto done2;
 1040                 }
 1041         }
 1042 
 1043         /*
 1044          * If the map has changed, due to the subyte, the previous
 1045          * output may be invalid.
 1046          */
 1047         vm_map_lock_read(map);
 1048         if (timestamp != map->timestamp)
 1049                 goto RestartScan;
 1050         vm_map_unlock_read(map);
 1051 done2:
 1052         return (error);
 1053 }
 1054 
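
kern_mincore() above fills in one status byte per page of the requested range; with the vm.mincore_mapped sysctl at its default of 1, the bits describe this process's mappings rather than global residency. A hedged userland sketch of reading the vector for a four-page anonymous region where only the first page has been touched:

#include <sys/mman.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        size_t len = 4 * 4096;
        char vec[4];
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");
        memset(p, 0, 4096);             /* touch only the first page */
        if (mincore(p, len, vec) == -1)
                err(1, "mincore");
        for (int i = 0; i < 4; i++)
                printf("page %d: %s\n", i,
                    (vec[i] & MINCORE_INCORE) ? "in core" : "not in core");
        return (0);
}
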
 1055 #ifndef _SYS_SYSPROTO_H_
 1056 struct mlock_args {
 1057         const void *addr;
 1058         size_t len;
 1059 };
 1060 #endif
 1061 int
 1062 sys_mlock(struct thread *td, struct mlock_args *uap)
 1063 {
 1064 
 1065         return (kern_mlock(td->td_proc, td->td_ucred,
 1066             __DECONST(uintptr_t, uap->addr), uap->len));
 1067 }
 1068 
 1069 int
 1070 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
 1071 {
 1072         vm_offset_t addr, end, last, start;
 1073         vm_size_t npages, size;
 1074         vm_map_t map;
 1075         unsigned long nsize;
 1076         int error;
 1077 
 1078         error = priv_check_cred(cred, PRIV_VM_MLOCK);
 1079         if (error)
 1080                 return (error);
 1081         addr = addr0;
 1082         size = len;
 1083         last = addr + size;
 1084         start = trunc_page(addr);
 1085         end = round_page(last);
 1086         if (last < addr || end < addr)
 1087                 return (EINVAL);
 1088         npages = atop(end - start);
 1089         if (npages > vm_page_max_user_wired)
 1090                 return (ENOMEM);
 1091         map = &proc->p_vmspace->vm_map;
 1092         PROC_LOCK(proc);
 1093         nsize = ptoa(npages + pmap_wired_count(map->pmap));
 1094         if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
 1095                 PROC_UNLOCK(proc);
 1096                 return (ENOMEM);
 1097         }
 1098         PROC_UNLOCK(proc);
 1099 #ifdef RACCT
 1100         if (racct_enable) {
 1101                 PROC_LOCK(proc);
 1102                 error = racct_set(proc, RACCT_MEMLOCK, nsize);
 1103                 PROC_UNLOCK(proc);
 1104                 if (error != 0)
 1105                         return (ENOMEM);
 1106         }
 1107 #endif
 1108         error = vm_map_wire(map, start, end,
 1109             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1110 #ifdef RACCT
 1111         if (racct_enable && error != KERN_SUCCESS) {
 1112                 PROC_LOCK(proc);
 1113                 racct_set(proc, RACCT_MEMLOCK,
 1114                     ptoa(pmap_wired_count(map->pmap)));
 1115                 PROC_UNLOCK(proc);
 1116         }
 1117 #endif
 1118         switch (error) {
 1119         case KERN_SUCCESS:
 1120                 return (0);
 1121         case KERN_INVALID_ARGUMENT:
 1122                 return (EINVAL);
 1123         default:
 1124                 return (ENOMEM);
 1125         }
 1126 }
 1127 
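
kern_mlock() above checks the vm_page_max_user_wired limit and RLIMIT_MEMLOCK before wiring the pages. Illustrative userland sketch (the 64 KB figure is an arbitrary assumption; whether an unprivileged process may call mlock() depends on system configuration):

#include <sys/mman.h>
#include <err.h>
#include <string.h>

int
main(void)
{
        size_t len = 64 * 1024;
        void *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");
        /* Wire the pages; fails with ENOMEM if RLIMIT_MEMLOCK is exceeded. */
        if (mlock(p, len) == -1)
                err(1, "mlock");
        memset(p, 0, len);              /* wired pages will not be paged out */
        munlock(p, len);
        munmap(p, len);
        return (0);
}
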
 1128 #ifndef _SYS_SYSPROTO_H_
 1129 struct mlockall_args {
 1130         int     how;
 1131 };
 1132 #endif
 1133 
 1134 int
 1135 sys_mlockall(struct thread *td, struct mlockall_args *uap)
 1136 {
 1137         vm_map_t map;
 1138         int error;
 1139 
 1140         map = &td->td_proc->p_vmspace->vm_map;
 1141         error = priv_check(td, PRIV_VM_MLOCK);
 1142         if (error)
 1143                 return (error);
 1144 
 1145         if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 1146                 return (EINVAL);
 1147 
 1148         /*
 1149          * If wiring all pages in the process would cause it to exceed
 1150          * a hard resource limit, return ENOMEM.
 1151          */
 1152         if (!old_mlock && uap->how & MCL_CURRENT) {
 1153                 if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
 1154                         return (ENOMEM);
 1155         }
 1156 #ifdef RACCT
 1157         if (racct_enable) {
 1158                 PROC_LOCK(td->td_proc);
 1159                 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
 1160                 PROC_UNLOCK(td->td_proc);
 1161                 if (error != 0)
 1162                         return (ENOMEM);
 1163         }
 1164 #endif
 1165 
 1166         if (uap->how & MCL_FUTURE) {
 1167                 vm_map_lock(map);
 1168                 vm_map_modflags(map, MAP_WIREFUTURE, 0);
 1169                 vm_map_unlock(map);
 1170                 error = 0;
 1171         }
 1172 
 1173         if (uap->how & MCL_CURRENT) {
 1174                 /*
 1175                  * P1003.1-2001 mandates that all currently mapped pages
 1176                  * will be memory resident and locked (wired) upon return
 1177                  * from mlockall(). vm_map_wire() will wire pages, by
 1178                  * calling vm_fault_wire() for each page in the region.
 1179                  */
 1180                 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 1181                     VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1182                 if (error == KERN_SUCCESS)
 1183                         error = 0;
 1184                 else if (error == KERN_RESOURCE_SHORTAGE)
 1185                         error = ENOMEM;
 1186                 else
 1187                         error = EAGAIN;
 1188         }
 1189 #ifdef RACCT
 1190         if (racct_enable && error != KERN_SUCCESS) {
 1191                 PROC_LOCK(td->td_proc);
 1192                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1193                     ptoa(pmap_wired_count(map->pmap)));
 1194                 PROC_UNLOCK(td->td_proc);
 1195         }
 1196 #endif
 1197 
 1198         return (error);
 1199 }
 1200 
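
sys_mlockall() above wires everything currently mapped (MCL_CURRENT) and/or sets MAP_WIREFUTURE so that later mappings are wired as they are created (MCL_FUTURE). A minimal sketch (privilege and RLIMIT_MEMLOCK constraints apply as in the code above):

#include <sys/mman.h>
#include <err.h>

int
main(void)
{
        /* Wire the whole address space, now and for future mappings. */
        if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
                err(1, "mlockall");
        /* ... latency-sensitive work runs without paging here ... */
        munlockall();
        return (0);
}
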
 1201 #ifndef _SYS_SYSPROTO_H_
 1202 struct munlockall_args {
 1203         register_t dummy;
 1204 };
 1205 #endif
 1206 
 1207 int
 1208 sys_munlockall(struct thread *td, struct munlockall_args *uap)
 1209 {
 1210         vm_map_t map;
 1211         int error;
 1212 
 1213         map = &td->td_proc->p_vmspace->vm_map;
 1214         error = priv_check(td, PRIV_VM_MUNLOCK);
 1215         if (error)
 1216                 return (error);
 1217 
 1218         /* Clear the MAP_WIREFUTURE flag from this vm_map. */
 1219         vm_map_lock(map);
 1220         vm_map_modflags(map, 0, MAP_WIREFUTURE);
 1221         vm_map_unlock(map);
 1222 
 1223         /* Forcibly unwire all pages. */
 1224         error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 1225             VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1226 #ifdef RACCT
 1227         if (racct_enable && error == KERN_SUCCESS) {
 1228                 PROC_LOCK(td->td_proc);
 1229                 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 1230                 PROC_UNLOCK(td->td_proc);
 1231         }
 1232 #endif
 1233 
 1234         return (error);
 1235 }
 1236 
 1237 #ifndef _SYS_SYSPROTO_H_
 1238 struct munlock_args {
 1239         const void *addr;
 1240         size_t len;
 1241 };
 1242 #endif
 1243 int
 1244 sys_munlock(struct thread *td, struct munlock_args *uap)
 1245 {
 1246 
 1247         return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
 1248 }
 1249 
 1250 int
 1251 kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
 1252 {
 1253         vm_offset_t addr, end, last, start;
 1254 #ifdef RACCT
 1255         vm_map_t map;
 1256 #endif
 1257         int error;
 1258 
 1259         error = priv_check(td, PRIV_VM_MUNLOCK);
 1260         if (error)
 1261                 return (error);
 1262         addr = addr0;
 1263         last = addr + size;
 1264         start = trunc_page(addr);
 1265         end = round_page(last);
 1266         if (last < addr || end < addr)
 1267                 return (EINVAL);
 1268         error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 1269             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1270 #ifdef RACCT
 1271         if (racct_enable && error == KERN_SUCCESS) {
 1272                 PROC_LOCK(td->td_proc);
 1273                 map = &td->td_proc->p_vmspace->vm_map;
 1274                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1275                     ptoa(pmap_wired_count(map->pmap)));
 1276                 PROC_UNLOCK(td->td_proc);
 1277         }
 1278 #endif
 1279         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1280 }
 1281 
 1282 /*
 1283  * vm_mmap_vnode()
 1284  *
  1285  * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 1286  * operations on vnodes.
 1287  */
 1288 int
 1289 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 1290     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1291     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
 1292     boolean_t *writecounted)
 1293 {
 1294         struct vattr va;
 1295         vm_object_t obj;
 1296         vm_ooffset_t foff;
 1297         struct ucred *cred;
 1298         int error, flags;
 1299         bool writex;
 1300 
 1301         cred = td->td_ucred;
 1302         writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
 1303             (*flagsp & MAP_SHARED) != 0;
 1304         if ((error = vget(vp, LK_SHARED)) != 0)
 1305                 return (error);
 1306         AUDIT_ARG_VNODE1(vp);
 1307         foff = *foffp;
 1308         flags = *flagsp;
 1309         obj = vp->v_object;
 1310         if (vp->v_type == VREG) {
 1311                 /*
 1312                  * Get the proper underlying object
 1313                  */
 1314                 if (obj == NULL) {
 1315                         error = EINVAL;
 1316                         goto done;
 1317                 }
 1318                 if (obj->type == OBJT_VNODE && obj->handle != vp) {
 1319                         vput(vp);
 1320                         vp = (struct vnode *)obj->handle;
 1321                         /*
 1322                          * Bypass filesystems obey the mpsafety of the
 1323                          * underlying fs.  Tmpfs never bypasses.
 1324                          */
 1325                         error = vget(vp, LK_SHARED);
 1326                         if (error != 0)
 1327                                 return (error);
 1328                 }
 1329                 if (writex) {
 1330                         *writecounted = TRUE;
 1331                         vm_pager_update_writecount(obj, 0, objsize);
 1332                 }
 1333         } else {
 1334                 error = EINVAL;
 1335                 goto done;
 1336         }
 1337         if ((error = VOP_GETATTR(vp, &va, cred)))
 1338                 goto done;
 1339 #ifdef MAC
 1340         /* This relies on VM_PROT_* matching PROT_*. */
 1341         error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
 1342         if (error != 0)
 1343                 goto done;
 1344 #endif
 1345         if ((flags & MAP_SHARED) != 0) {
 1346                 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
 1347                         if (prot & VM_PROT_WRITE) {
 1348                                 error = EPERM;
 1349                                 goto done;
 1350                         }
 1351                         *maxprotp &= ~VM_PROT_WRITE;
 1352                 }
 1353         }
 1354         /*
 1355          * If it is a regular file without any references
 1356          * we do not need to sync it.
 1357          * Adjust object size to be the size of actual file.
 1358          */
 1359         objsize = round_page(va.va_size);
 1360         if (va.va_nlink == 0)
 1361                 flags |= MAP_NOSYNC;
 1362         if (obj->type == OBJT_VNODE) {
 1363                 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 1364                     cred);
 1365                 if (obj == NULL) {
 1366                         error = ENOMEM;
 1367                         goto done;
 1368                 }
 1369         } else {
 1370                 KASSERT((obj->flags & OBJ_SWAP) != 0, ("wrong object type"));
 1371                 vm_object_reference(obj);
 1372 #if VM_NRESERVLEVEL > 0
 1373                 if ((obj->flags & OBJ_COLORED) == 0) {
 1374                         VM_OBJECT_WLOCK(obj);
 1375                         vm_object_color(obj, 0);
 1376                         VM_OBJECT_WUNLOCK(obj);
 1377                 }
 1378 #endif
 1379         }
 1380         *objp = obj;
 1381         *flagsp = flags;
 1382 
 1383         VOP_MMAPPED(vp);
 1384 
 1385 done:
 1386         if (error != 0 && *writecounted) {
 1387                 *writecounted = FALSE;
 1388                 vm_pager_update_writecount(obj, objsize, 0);
 1389         }
 1390         vput(vp);
 1391         return (error);
 1392 }
 1393 
 1394 /*
 1395  * vm_mmap_cdev()
 1396  *
  1397  * Helper function for vm_mmap.  Perform sanity checks specific to mmap
 1398  * operations on cdevs.
 1399  */
 1400 int
 1401 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
 1402     vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
 1403     vm_ooffset_t *foff, vm_object_t *objp)
 1404 {
 1405         vm_object_t obj;
 1406         int error, flags;
 1407 
 1408         flags = *flagsp;
 1409 
 1410         if (dsw->d_flags & D_MMAP_ANON) {
 1411                 *objp = NULL;
 1412                 *foff = 0;
 1413                 *maxprotp = VM_PROT_ALL;
 1414                 *flagsp |= MAP_ANON;
 1415                 return (0);
 1416         }
 1417         /*
 1418          * cdevs do not provide private mappings of any kind.
 1419          */
 1420         if ((*maxprotp & VM_PROT_WRITE) == 0 &&
 1421             (prot & VM_PROT_WRITE) != 0)
 1422                 return (EACCES);
 1423         if (flags & (MAP_PRIVATE|MAP_COPY))
 1424                 return (EINVAL);
 1425         /*
 1426          * Force device mappings to be shared.
 1427          */
 1428         flags |= MAP_SHARED;
 1429 #ifdef MAC_XXX
 1430         error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
 1431         if (error != 0)
 1432                 return (error);
 1433 #endif
 1434         /*
 1435          * First, try d_mmap_single().  If that is not implemented
 1436          * (returns ENODEV), fall back to using the device pager.
 1437          * Note that d_mmap_single() must return a reference to the
 1438          * object (it needs to bump the reference count of the object
 1439          * it returns somehow).
 1440          *
 1441          * XXX assumes VM_PROT_* == PROT_*
 1442          */
 1443         error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
 1444         if (error != ENODEV)
 1445                 return (error);
 1446         obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
 1447             td->td_ucred);
 1448         if (obj == NULL)
 1449                 return (EINVAL);
 1450         *objp = obj;
 1451         *flagsp = flags;
 1452         return (0);
 1453 }
 1454 
 1455 int
 1456 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1457         vm_prot_t maxprot, int flags,
 1458         objtype_t handle_type, void *handle,
 1459         vm_ooffset_t foff)
 1460 {
 1461         vm_object_t object;
 1462         struct thread *td = curthread;
 1463         int error;
 1464         boolean_t writecounted;
 1465 
 1466         if (size == 0)
 1467                 return (EINVAL);
 1468 
 1469         size = round_page(size);
 1470         object = NULL;
 1471         writecounted = FALSE;
 1472 
 1473         switch (handle_type) {
 1474         case OBJT_DEVICE: {
 1475                 struct cdevsw *dsw;
 1476                 struct cdev *cdev;
 1477                 int ref;
 1478 
 1479                 cdev = handle;
 1480                 dsw = dev_refthread(cdev, &ref);
 1481                 if (dsw == NULL)
 1482                         return (ENXIO);
 1483                 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
 1484                     dsw, &foff, &object);
 1485                 dev_relthread(cdev, ref);
 1486                 break;
 1487         }
 1488         case OBJT_VNODE:
 1489                 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
 1490                     handle, &foff, &object, &writecounted);
 1491                 break;
 1492         default:
 1493                 error = EINVAL;
 1494                 break;
 1495         }
 1496         if (error)
 1497                 return (error);
 1498 
 1499         error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 1500             foff, writecounted, td);
 1501         if (error != 0 && object != NULL) {
 1502                 /*
 1503                  * If this mapping was accounted for in the vnode's
 1504                  * writecount, then undo that now.
 1505                  */
 1506                 if (writecounted)
 1507                         vm_pager_release_writecount(object, 0, size);
 1508                 vm_object_deallocate(object);
 1509         }
 1510         return (error);
 1511 }
 1512 
 1513 int
 1514 kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
 1515 {
 1516         int error;
 1517 
 1518         RACCT_PROC_LOCK(td->td_proc);
 1519         if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
 1520                 RACCT_PROC_UNLOCK(td->td_proc);
 1521                 return (ENOMEM);
 1522         }
 1523         if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
 1524                 RACCT_PROC_UNLOCK(td->td_proc);
 1525                 return (ENOMEM);
 1526         }
 1527         if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 1528                 if (ptoa(pmap_wired_count(map->pmap)) + size >
 1529                     lim_cur(td, RLIMIT_MEMLOCK)) {
 1530                         racct_set_force(td->td_proc, RACCT_VMEM, map->size);
 1531                         RACCT_PROC_UNLOCK(td->td_proc);
 1532                         return (ENOMEM);
 1533                 }
 1534                 error = racct_set(td->td_proc, RACCT_MEMLOCK,
 1535                     ptoa(pmap_wired_count(map->pmap)) + size);
 1536                 if (error != 0) {
 1537                         racct_set_force(td->td_proc, RACCT_VMEM, map->size);
 1538                         RACCT_PROC_UNLOCK(td->td_proc);
 1539                         return (error);
 1540                 }
 1541         }
 1542         RACCT_PROC_UNLOCK(td->td_proc);
 1543         return (0);
 1544 }
 1545 
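kern_mmap_racct_check() bounds a new mapping against RLIMIT_VMEM and, when the map carries MAP_WIREFUTURE (set by mlockall(MCL_FUTURE)), against RLIMIT_MEMLOCK, charging racct along the way. A minimal userland demonstration of the RLIMIT_VMEM side, assuming an otherwise default environment: once map->size plus the request would exceed the vmemoryuse limit, mmap(2) fails with ENOMEM.

/* Illustrative only: shrinks RLIMIT_VMEM, then shows a large mmap failing. */
#include <sys/mman.h>
#include <sys/resource.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
        struct rlimit rl;
        void *p;

        if (getrlimit(RLIMIT_VMEM, &rl) == -1)
                err(1, "getrlimit");
        rl.rlim_cur = 64UL * 1024 * 1024;       /* 64 MB of total VM */
        if (setrlimit(RLIMIT_VMEM, &rl) == -1)
                err(1, "setrlimit");

        /* A 128 MB request now pushes map->size + size past the limit. */
        p = mmap(NULL, 128UL * 1024 * 1024, PROT_READ | PROT_WRITE,
            MAP_ANON, -1, 0);
        if (p == MAP_FAILED && errno == ENOMEM)
                printf("mmap failed with ENOMEM as expected\n");
        return (0);
}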
 1546 /*
 1547  * Internal version of mmap that maps a specific VM object into a
 1548  * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 1549  */
 1550 int
 1551 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1552     vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
 1553     boolean_t writecounted, struct thread *td)
 1554 {
 1555         vm_offset_t max_addr;
 1556         int docow, error, findspace, rv;
 1557         bool curmap, fitit;
 1558 
 1559         curmap = map == &td->td_proc->p_vmspace->vm_map;
 1560         if (curmap) {
 1561                 error = kern_mmap_racct_check(td, map, size);
 1562                 if (error != 0)
 1563                         return (error);
 1564         }
 1565 
 1566         /*
 1567          * We currently can only deal with page aligned file offsets.
 1568          * The mmap() system call already enforces this by subtracting
 1569          * the page offset from the file offset, but checking here
 1570  * catches errors in device drivers (e.g. d_mmap_single()
 1571          * callbacks) and other internal mapping requests (such as in
 1572          * exec).
 1573          */
 1574         if (foff & PAGE_MASK)
 1575                 return (EINVAL);
 1576 
 1577         if ((flags & MAP_FIXED) == 0) {
 1578                 fitit = TRUE;
 1579                 *addr = round_page(*addr);
 1580         } else {
 1581                 if (*addr != trunc_page(*addr))
 1582                         return (EINVAL);
 1583                 fitit = FALSE;
 1584         }
 1585 
 1586         if (flags & MAP_ANON) {
 1587                 if (object != NULL || foff != 0)
 1588                         return (EINVAL);
 1589                 docow = 0;
 1590         } else if (flags & MAP_PREFAULT_READ)
 1591                 docow = MAP_PREFAULT;
 1592         else
 1593                 docow = MAP_PREFAULT_PARTIAL;
 1594 
 1595         if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 1596                 docow |= MAP_COPY_ON_WRITE;
 1597         if (flags & MAP_NOSYNC)
 1598                 docow |= MAP_DISABLE_SYNCER;
 1599         if (flags & MAP_NOCORE)
 1600                 docow |= MAP_DISABLE_COREDUMP;
 1601         /* Shared memory is also shared with children. */
 1602         if (flags & MAP_SHARED)
 1603                 docow |= MAP_INHERIT_SHARE;
 1604         if (writecounted)
 1605                 docow |= MAP_WRITECOUNT;
 1606         if (flags & MAP_STACK) {
 1607                 if (object != NULL)
 1608                         return (EINVAL);
 1609                 docow |= MAP_STACK_GROWS_DOWN;
 1610         }
 1611         if ((flags & MAP_EXCL) != 0)
 1612                 docow |= MAP_CHECK_EXCL;
 1613         if ((flags & MAP_GUARD) != 0)
 1614                 docow |= MAP_CREATE_GUARD;
 1615 
 1616         if (fitit) {
 1617                 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
 1618                         findspace = VMFS_SUPER_SPACE;
 1619                 else if ((flags & MAP_ALIGNMENT_MASK) != 0)
 1620                         findspace = VMFS_ALIGNED_SPACE(flags >>
 1621                             MAP_ALIGNMENT_SHIFT);
 1622                 else
 1623                         findspace = VMFS_OPTIMAL_SPACE;
 1624                 max_addr = 0;
 1625 #ifdef MAP_32BIT
 1626                 if ((flags & MAP_32BIT) != 0)
 1627                         max_addr = MAP_32BIT_MAX_ADDR;
 1628 #endif
 1629                 if (curmap) {
 1630                         rv = vm_map_find_min(map, object, foff, addr, size,
 1631                             round_page((vm_offset_t)td->td_proc->p_vmspace->
 1632                             vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
 1633                             findspace, prot, maxprot, docow);
 1634                 } else {
 1635                         rv = vm_map_find(map, object, foff, addr, size,
 1636                             max_addr, findspace, prot, maxprot, docow);
 1637                 }
 1638         } else {
 1639                 rv = vm_map_fixed(map, object, foff, *addr, size,
 1640                     prot, maxprot, docow);
 1641         }
 1642 
 1643         if (rv == KERN_SUCCESS) {
 1644                 /*
 1645                  * If the process has requested that all future mappings
 1646                  * be wired, then heed this.
 1647                  */
 1648                 if ((map->flags & MAP_WIREFUTURE) != 0) {
 1649                         vm_map_lock(map);
 1650                         if ((map->flags & MAP_WIREFUTURE) != 0)
 1651                                 (void)vm_map_wire_locked(map, *addr,
 1652                                     *addr + size, VM_MAP_WIRE_USER |
 1653                                     ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
 1654                                     VM_MAP_WIRE_NOHOLES));
 1655                         vm_map_unlock(map);
 1656                 }
 1657         }
 1658         return (vm_mmap_to_errno(rv));
 1659 }
 1660 
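vm_mmap_object() is where the user-visible MAP_* flags become vm_map find-space and copy-on-write bits: MAP_ALIGNED_SUPER selects VMFS_SUPER_SPACE, and a map marked MAP_WIREFUTURE (via mlockall(MCL_FUTURE)) has every successful mapping wired on the spot. A hedged userland sketch exercising both; the wiring is subject to RLIMIT_MEMLOCK, so the mmap() below can fail with ENOMEM under a small default memorylocked limit.

/* Illustrative only: requires FreeBSD; wiring is limited by RLIMIT_MEMLOCK. */
#include <sys/mman.h>
#include <err.h>
#include <string.h>

int
main(void)
{
        size_t len = 2 * 1024 * 1024;
        char *p;

        /* Future mappings get the MAP_WIREFUTURE treatment in vm_mmap_object(). */
        if (mlockall(MCL_FUTURE) == -1)
                err(1, "mlockall");

        /* Ask for superpage alignment; translated to VMFS_SUPER_SPACE above. */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_ALIGNED_SUPER, -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");         /* ENOMEM if RLIMIT_MEMLOCK is too small */
        memset(p, 0, len);              /* already wired, so no faults here */
        munmap(p, len);
        return (0);
}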
 1661 /*
 1662  * Translate a Mach VM return code to zero on success or the appropriate errno
 1663  * on failure.
 1664  */
 1665 int
 1666 vm_mmap_to_errno(int rv)
 1667 {
 1668 
 1669         switch (rv) {
 1670         case KERN_SUCCESS:
 1671                 return (0);
 1672         case KERN_INVALID_ADDRESS:
 1673         case KERN_NO_SPACE:
 1674                 return (ENOMEM);
 1675         case KERN_PROTECTION_FAILURE:
 1676                 return (EACCES);
 1677         default:
 1678                 return (EINVAL);
 1679         }
 1680 }
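One user-visible consequence of this translation: a MAP_FIXED|MAP_EXCL request over a range that is already mapped makes vm_map_fixed() (with MAP_CHECK_EXCL set above) return KERN_NO_SPACE, which surfaces to the caller as ENOMEM. A small illustrative program:

/* Illustrative only: shows KERN_NO_SPACE surfacing as ENOMEM via MAP_EXCL. */
#include <sys/mman.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
        size_t len = 4096;
        void *p, *q;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON, -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        /* The range is occupied, so MAP_CHECK_EXCL makes vm_map_fixed() fail. */
        q = mmap(p, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_FIXED | MAP_EXCL, -1, 0);
        if (q == MAP_FAILED && errno == ENOMEM)
                printf("MAP_EXCL collision reported as ENOMEM\n");

        munmap(p, len);
        return (0);
}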

This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.