FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1988 University of Utah.
    5  * Copyright (c) 1991, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * the Systems Programming Group of the University of Utah Computer
   10  * Science Department.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   37  *
   38  *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
   39  */
   40 
   41 /*
   42  * Mapped file (mmap) interface to VM
   43  */
   44 
   45 #include <sys/cdefs.h>
   46 __FBSDID("$FreeBSD$");
   47 
   48 #include "opt_hwpmc_hooks.h"
   49 #include "opt_vm.h"
   50 
   51 #include <sys/param.h>
   52 #include <sys/systm.h>
   53 #include <sys/capsicum.h>
   54 #include <sys/kernel.h>
   55 #include <sys/lock.h>
   56 #include <sys/mutex.h>
   57 #include <sys/sysproto.h>
   58 #include <sys/elf.h>
   59 #include <sys/filedesc.h>
   60 #include <sys/priv.h>
   61 #include <sys/proc.h>
   62 #include <sys/procctl.h>
   63 #include <sys/racct.h>
   64 #include <sys/resource.h>
   65 #include <sys/resourcevar.h>
   66 #include <sys/rwlock.h>
   67 #include <sys/sysctl.h>
   68 #include <sys/vnode.h>
   69 #include <sys/fcntl.h>
   70 #include <sys/file.h>
   71 #include <sys/mman.h>
   72 #include <sys/mount.h>
   73 #include <sys/conf.h>
   74 #include <sys/stat.h>
   75 #include <sys/syscallsubr.h>
   76 #include <sys/sysent.h>
   77 #include <sys/vmmeter.h>
   78 #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */
   79 #include <machine/md_var.h>
   80 #endif
   81 
   82 #include <security/audit/audit.h>
   83 #include <security/mac/mac_framework.h>
   84 
   85 #include <vm/vm.h>
   86 #include <vm/vm_param.h>
   87 #include <vm/pmap.h>
   88 #include <vm/vm_map.h>
   89 #include <vm/vm_object.h>
   90 #include <vm/vm_page.h>
   91 #include <vm/vm_pager.h>
   92 #include <vm/vm_pageout.h>
   93 #include <vm/vm_extern.h>
   94 #include <vm/vm_page.h>
   95 #include <vm/vnode_pager.h>
   96 
   97 #ifdef HWPMC_HOOKS
   98 #include <sys/pmckern.h>
   99 #endif
  100 
  101 int old_mlock = 0;
  102 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0,
  103     "Do not apply RLIMIT_MEMLOCK on mlockall");
  104 static int mincore_mapped = 1;
  105 SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0,
  106     "mincore reports mappings, not residency");
  107 static int imply_prot_max = 0;
  108 SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0,
  109     "Imply maximum page protections in mmap() when none are specified");
  110 
  111 #ifdef MAP_32BIT
  112 #define MAP_32BIT_MAX_ADDR      ((vm_offset_t)1 << 31)
  113 #endif
  114 
  115 _Static_assert(MAXPAGESIZES <= 4, "MINCORE_SUPER too narrow");
  116 
  117 #ifndef _SYS_SYSPROTO_H_
  118 struct sbrk_args {
  119         int incr;
  120 };
  121 #endif
  122 
  123 int
  124 sys_sbrk(struct thread *td, struct sbrk_args *uap)
  125 {
  126         /* Not yet implemented */
  127         return (EOPNOTSUPP);
  128 }
  129 
  130 #ifndef _SYS_SYSPROTO_H_
  131 struct sstk_args {
  132         int incr;
  133 };
  134 #endif
  135 
  136 int
  137 sys_sstk(struct thread *td, struct sstk_args *uap)
  138 {
  139         /* Not yet implemented */
  140         return (EOPNOTSUPP);
  141 }
  142 
  143 #if defined(COMPAT_43)
  144 int
  145 ogetpagesize(struct thread *td, struct ogetpagesize_args *uap)
  146 {
  147 
  148         td->td_retval[0] = PAGE_SIZE;
  149         return (0);
  150 }
  151 #endif                          /* COMPAT_43 */
  152 
  153 /*
  154  * Memory Map (mmap) system call.  Note that the file offset
  155  * and address are allowed to be NOT page aligned, though if
   156  * the MAP_FIXED flag is set, both must have the same remainder
  157  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  158  * page-aligned, the actual mapping starts at trunc_page(addr)
  159  * and the return value is adjusted up by the page offset.
  160  *
  161  * Generally speaking, only character devices which are themselves
  162  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  163  * there would be no cache coherency between a descriptor and a VM mapping
  164  * both to the same character device.
  165  */
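/*
 * Editorial sketch, not part of the original source: a worked example of
 * the adjustment described above.  With PAGE_SIZE = 4096, a call such as
 *
 *	p = mmap(NULL, 100, PROT_READ, MAP_SHARED, fd, 4111);
 *
 * has a page offset of 4111 & PAGE_MASK = 15, so the kernel maps from
 * file offset trunc_page(4111) = 4096 and returns addr + 15, a pointer
 * whose low bits mirror the file offset's offset within its page.
 */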
  166 #ifndef _SYS_SYSPROTO_H_
  167 struct mmap_args {
  168         void *addr;
  169         size_t len;
  170         int prot;
  171         int flags;
  172         int fd;
  173         long pad;
  174         off_t pos;
  175 };
  176 #endif
  177 
  178 int
  179 sys_mmap(struct thread *td, struct mmap_args *uap)
  180 {
  181 
  182         return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
  183             uap->flags, uap->fd, uap->pos));
  184 }
  185 
  186 int
  187 kern_mmap_maxprot(struct proc *p, int prot)
  188 {
  189 
  190         if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 ||
  191             (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0)
  192                 return (_PROT_ALL);
  193         if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) &&
  194             prot != PROT_NONE)
  195                  return (prot);
  196         return (_PROT_ALL);
  197 }
  198 
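/*
 * Editorial sketch, not part of the original source: PROT_MAX() encodes a
 * separate maximum-protection request in the high bits of the prot
 * argument; kern_mmap_req() below splits it out again with
 * PROT_MAX_EXTRACT() and PROT_EXTRACT().  A caller that wants a mapping
 * it can later make writable, but never executable, might request
 *
 *	mmap(NULL, len, PROT_READ | PROT_MAX(PROT_READ | PROT_WRITE),
 *	    MAP_PRIVATE | MAP_ANON, -1, 0);
 *
 * When no PROT_MAX() bits are given, kern_mmap_maxprot() above decides
 * whether to imply the requested protections (vm.imply_prot_max or
 * P2_PROTMAX_ENABLE) or to fall back to _PROT_ALL.
 */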
  199 int
  200 kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags,
  201     int fd, off_t pos)
  202 {
  203         struct mmap_req mr = {
  204                 .mr_hint = addr0,
  205                 .mr_len = len,
  206                 .mr_prot = prot,
  207                 .mr_flags = flags,
  208                 .mr_fd = fd,
  209                 .mr_pos = pos
  210         };
  211 
  212         return (kern_mmap_req(td, &mr));
  213 }
  214 
  215 int
  216 kern_mmap_req(struct thread *td, const struct mmap_req *mrp)
  217 {
  218         struct vmspace *vms;
  219         struct file *fp;
  220         struct proc *p;
  221         off_t pos;
  222         vm_offset_t addr, orig_addr;
  223         vm_size_t len, pageoff, size;
  224         vm_prot_t cap_maxprot;
  225         int align, error, fd, flags, max_prot, prot;
  226         cap_rights_t rights;
  227         mmap_check_fp_fn check_fp_fn;
  228 
  229         orig_addr = addr = mrp->mr_hint;
  230         len = mrp->mr_len;
  231         prot = mrp->mr_prot;
  232         flags = mrp->mr_flags;
  233         fd = mrp->mr_fd;
  234         pos = mrp->mr_pos;
  235         check_fp_fn = mrp->mr_check_fp_fn;
  236 
  237         if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
  238                 return (EINVAL);
  239         max_prot = PROT_MAX_EXTRACT(prot);
  240         prot = PROT_EXTRACT(prot);
  241         if (max_prot != 0 && (max_prot & prot) != prot)
  242                 return (ENOTSUP);
  243 
  244         p = td->td_proc;
  245 
  246         /*
  247          * Always honor PROT_MAX if set.  If not, default to all
  248          * permissions unless we're implying maximum permissions.
  249          */
  250         if (max_prot == 0)
  251                 max_prot = kern_mmap_maxprot(p, prot);
  252 
  253         vms = p->p_vmspace;
  254         fp = NULL;
  255         AUDIT_ARG_FD(fd);
  256 
  257         /*
  258          * Ignore old flags that used to be defined but did not do anything.
  259          */
  260         flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040);
  261 
  262         /*
  263          * Enforce the constraints.
  264          * Mapping of length 0 is only allowed for old binaries.
   265  * Anonymous mapping shall specify -1 as the file descriptor and
  266          * zero position for new code. Be nice to ancient a.out
  267          * binaries and correct pos for anonymous mapping, since old
  268          * ld.so sometimes issues anonymous map requests with non-zero
  269          * pos.
  270          */
  271         if (!SV_CURPROC_FLAG(SV_AOUT)) {
  272                 if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) ||
  273                     ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0)))
  274                         return (EINVAL);
  275         } else {
  276                 if ((flags & MAP_ANON) != 0)
  277                         pos = 0;
  278         }
  279 
  280         if (flags & MAP_STACK) {
  281                 if ((fd != -1) ||
  282                     ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
  283                         return (EINVAL);
  284                 flags |= MAP_ANON;
  285                 pos = 0;
  286         }
  287         if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE |
  288             MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE |
  289             MAP_PREFAULT_READ | MAP_GUARD |
  290 #ifdef MAP_32BIT
  291             MAP_32BIT |
  292 #endif
  293             MAP_ALIGNMENT_MASK)) != 0)
  294                 return (EINVAL);
  295         if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
  296                 return (EINVAL);
  297         if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE))
  298                 return (EINVAL);
  299         if (prot != PROT_NONE &&
  300             (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0)
  301                 return (EINVAL);
  302         if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 ||
  303             pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL |
  304 #ifdef MAP_32BIT
  305             MAP_32BIT |
  306 #endif
  307             MAP_ALIGNMENT_MASK)) != 0))
  308                 return (EINVAL);
  309 
  310         /*
  311          * Align the file position to a page boundary,
  312          * and save its page offset component.
  313          */
  314         pageoff = (pos & PAGE_MASK);
  315         pos -= pageoff;
  316 
  317         /* Compute size from len by rounding (on both ends). */
  318         size = len + pageoff;                   /* low end... */
  319         size = round_page(size);                /* hi end */
  320         /* Check for rounding up to zero. */
  321         if (len > size)
  322                 return (ENOMEM);
  323 
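        /*
         * Editorial note, not part of the original source: the "len > size"
         * test above catches overflow in the rounding.  With 4KB pages, a
         * request with len near SIZE_MAX makes round_page(len + pageoff)
         * wrap to a small value, so len > size holds and the call fails
         * with ENOMEM instead of creating a bogus mapping.
         */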
  324         /* Ensure alignment is at least a page and fits in a pointer. */
  325         align = flags & MAP_ALIGNMENT_MASK;
  326         if (align != 0 && align != MAP_ALIGNED_SUPER &&
  327             (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
  328             align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
  329                 return (EINVAL);
  330 
  331         /*
  332          * Check for illegal addresses.  Watch out for address wrap... Note
  333          * that VM_*_ADDRESS are not constants due to casts (argh).
  334          */
  335         if (flags & MAP_FIXED) {
  336                 /*
  337                  * The specified address must have the same remainder
  338                  * as the file offset taken modulo PAGE_SIZE, so it
  339                  * should be aligned after adjustment by pageoff.
  340                  */
  341                 addr -= pageoff;
  342                 if (addr & PAGE_MASK)
  343                         return (EINVAL);
  344 
  345                 /* Address range must be all in user VM space. */
  346                 if (!vm_map_range_valid(&vms->vm_map, addr, addr + size))
  347                         return (EINVAL);
  348 #ifdef MAP_32BIT
  349                 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
  350                         return (EINVAL);
  351         } else if (flags & MAP_32BIT) {
  352                 /*
  353                  * For MAP_32BIT, override the hint if it is too high and
  354                  * do not bother moving the mapping past the heap (since
  355                  * the heap is usually above 2GB).
  356                  */
  357                 if (addr + size > MAP_32BIT_MAX_ADDR)
  358                         addr = 0;
  359 #endif
  360         } else {
  361                 /*
  362                  * XXX for non-fixed mappings where no hint is provided or
  363                  * the hint would fall in the potential heap space,
  364                  * place it after the end of the largest possible heap.
  365                  *
  366                  * There should really be a pmap call to determine a reasonable
  367                  * location.
  368                  */
  369                 if (addr == 0 ||
  370                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
  371                     addr < round_page((vm_offset_t)vms->vm_daddr +
  372                     lim_max(td, RLIMIT_DATA))))
  373                         addr = round_page((vm_offset_t)vms->vm_daddr +
  374                             lim_max(td, RLIMIT_DATA));
  375         }
  376         if (len == 0) {
  377                 /*
  378                  * Return success without mapping anything for old
  379                  * binaries that request a page-aligned mapping of
  380                  * length 0.  For modern binaries, this function
  381                  * returns an error earlier.
  382                  */
  383                 error = 0;
  384         } else if ((flags & MAP_GUARD) != 0) {
  385                 error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE,
  386                     VM_PROT_NONE, flags, NULL, pos, FALSE, td);
  387         } else if ((flags & MAP_ANON) != 0) {
  388                 /*
  389                  * Mapping blank space is trivial.
  390                  *
  391                  * This relies on VM_PROT_* matching PROT_*.
  392                  */
  393                 error = vm_mmap_object(&vms->vm_map, &addr, size, prot,
  394                     max_prot, flags, NULL, pos, FALSE, td);
  395         } else {
  396                 /*
   397  * Mapping a file: get fp for validation and don't let the
  398                  * descriptor disappear on us if we block. Check capability
  399                  * rights, but also return the maximum rights to be combined
  400                  * with maxprot later.
  401                  */
  402                 cap_rights_init_one(&rights, CAP_MMAP);
  403                 if (prot & PROT_READ)
  404                         cap_rights_set_one(&rights, CAP_MMAP_R);
  405                 if ((flags & MAP_SHARED) != 0) {
  406                         if (prot & PROT_WRITE)
  407                                 cap_rights_set_one(&rights, CAP_MMAP_W);
  408                 }
  409                 if (prot & PROT_EXEC)
  410                         cap_rights_set_one(&rights, CAP_MMAP_X);
  411                 error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp);
  412                 if (error != 0)
  413                         goto done;
  414                 if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 &&
  415                     p->p_osrel >= P_OSREL_MAP_FSTRICT) {
  416                         error = EINVAL;
  417                         goto done;
  418                 }
  419                 if (check_fp_fn != NULL) {
  420                         error = check_fp_fn(fp, prot, max_prot & cap_maxprot,
  421                             flags);
  422                         if (error != 0)
  423                                 goto done;
  424                 }
  425                 if (fp->f_ops == &shm_ops && shm_largepage(fp->f_data))
  426                         addr = orig_addr;
  427                 /* This relies on VM_PROT_* matching PROT_*. */
  428                 error = fo_mmap(fp, &vms->vm_map, &addr, size, prot,
  429                     max_prot & cap_maxprot, flags, pos, td);
  430         }
  431 
  432         if (error == 0)
  433                 td->td_retval[0] = (register_t) (addr + pageoff);
  434 done:
  435         if (fp)
  436                 fdrop(fp, td);
  437 
  438         return (error);
  439 }
  440 
  441 #if defined(COMPAT_FREEBSD6)
  442 int
  443 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
  444 {
  445 
  446         return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot,
  447             uap->flags, uap->fd, uap->pos));
  448 }
  449 #endif
  450 
  451 #ifdef COMPAT_43
  452 #ifndef _SYS_SYSPROTO_H_
  453 struct ommap_args {
  454         caddr_t addr;
  455         int len;
  456         int prot;
  457         int flags;
  458         int fd;
  459         long pos;
  460 };
  461 #endif
  462 int
  463 ommap(struct thread *td, struct ommap_args *uap)
  464 {
  465         static const char cvtbsdprot[8] = {
  466                 0,
  467                 PROT_EXEC,
  468                 PROT_WRITE,
  469                 PROT_EXEC | PROT_WRITE,
  470                 PROT_READ,
  471                 PROT_EXEC | PROT_READ,
  472                 PROT_WRITE | PROT_READ,
  473                 PROT_EXEC | PROT_WRITE | PROT_READ,
  474         };
  475         int flags, prot;
  476 
  477 #define OMAP_ANON       0x0002
  478 #define OMAP_COPY       0x0020
  479 #define OMAP_SHARED     0x0010
  480 #define OMAP_FIXED      0x0100
  481 
  482         prot = cvtbsdprot[uap->prot & 0x7];
  483 #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__)
  484         if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
  485             prot != 0)
  486                 prot |= PROT_EXEC;
  487 #endif
  488         flags = 0;
  489         if (uap->flags & OMAP_ANON)
  490                 flags |= MAP_ANON;
  491         if (uap->flags & OMAP_COPY)
  492                 flags |= MAP_COPY;
  493         if (uap->flags & OMAP_SHARED)
  494                 flags |= MAP_SHARED;
  495         else
  496                 flags |= MAP_PRIVATE;
  497         if (uap->flags & OMAP_FIXED)
  498                 flags |= MAP_FIXED;
  499         return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags,
  500             uap->fd, uap->pos));
  501 }
  502 #endif                          /* COMPAT_43 */
  503 
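/*
 * Editorial sketch, not part of the original source: the cvtbsdprot[]
 * table in ommap() above translates the historic 4.3BSD protection
 * encoding (bit 0 = execute, bit 1 = write, bit 2 = read) into modern
 * PROT_* flags.  For example, an old binary passing prot = 5
 * (read | execute) gets cvtbsdprot[5] = PROT_EXEC | PROT_READ before
 * the request is forwarded to kern_mmap().
 */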
  504 #ifndef _SYS_SYSPROTO_H_
  505 struct msync_args {
  506         void *addr;
  507         size_t len;
  508         int flags;
  509 };
  510 #endif
  511 int
  512 sys_msync(struct thread *td, struct msync_args *uap)
  513 {
  514 
  515         return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags));
  516 }
  517 
  518 int
  519 kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags)
  520 {
  521         vm_offset_t addr;
  522         vm_size_t pageoff;
  523         vm_map_t map;
  524         int rv;
  525 
  526         addr = addr0;
  527         pageoff = (addr & PAGE_MASK);
  528         addr -= pageoff;
  529         size += pageoff;
  530         size = (vm_size_t) round_page(size);
  531         if (addr + size < addr)
  532                 return (EINVAL);
  533 
  534         if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
  535                 return (EINVAL);
  536 
  537         map = &td->td_proc->p_vmspace->vm_map;
  538 
  539         /*
  540          * Clean the pages and interpret the return value.
  541          */
  542         rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
  543             (flags & MS_INVALIDATE) != 0);
  544         switch (rv) {
  545         case KERN_SUCCESS:
  546                 return (0);
  547         case KERN_INVALID_ADDRESS:
  548                 return (ENOMEM);
  549         case KERN_INVALID_ARGUMENT:
  550                 return (EBUSY);
  551         case KERN_FAILURE:
  552                 return (EIO);
  553         default:
  554                 return (EINVAL);
  555         }
  556 }
  557 
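/*
 * Editorial sketch, not part of the original source: a typical caller
 * flushes a dirty file-backed range synchronously with
 *
 *	msync(addr, len, MS_SYNC);
 *
 * kern_msync() above rejects the MS_ASYNC | MS_INVALIDATE combination
 * with EINVAL and maps the vm_map_sync() status codes onto errno values,
 * e.g. KERN_INVALID_ARGUMENT becomes EBUSY.
 */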
  558 #ifndef _SYS_SYSPROTO_H_
  559 struct munmap_args {
  560         void *addr;
  561         size_t len;
  562 };
  563 #endif
  564 int
  565 sys_munmap(struct thread *td, struct munmap_args *uap)
  566 {
  567 
  568         return (kern_munmap(td, (uintptr_t)uap->addr, uap->len));
  569 }
  570 
  571 int
  572 kern_munmap(struct thread *td, uintptr_t addr0, size_t size)
  573 {
  574 #ifdef HWPMC_HOOKS
  575         struct pmckern_map_out pkm;
  576         vm_map_entry_t entry;
  577         bool pmc_handled;
  578 #endif
  579         vm_offset_t addr, end;
  580         vm_size_t pageoff;
  581         vm_map_t map;
  582         int rv;
  583 
  584         if (size == 0)
  585                 return (EINVAL);
  586 
  587         addr = addr0;
  588         pageoff = (addr & PAGE_MASK);
  589         addr -= pageoff;
  590         size += pageoff;
  591         size = (vm_size_t) round_page(size);
  592         end = addr + size;
  593         map = &td->td_proc->p_vmspace->vm_map;
  594         if (!vm_map_range_valid(map, addr, end))
  595                 return (EINVAL);
  596 
  597         vm_map_lock(map);
  598 #ifdef HWPMC_HOOKS
  599         pmc_handled = false;
  600         if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) {
  601                 pmc_handled = true;
  602                 /*
  603                  * Inform hwpmc if the address range being unmapped contains
  604                  * an executable region.
  605                  */
  606                 pkm.pm_address = (uintptr_t) NULL;
  607                 if (vm_map_lookup_entry(map, addr, &entry)) {
  608                         for (; entry->start < end;
  609                             entry = vm_map_entry_succ(entry)) {
  610                                 if (vm_map_check_protection(map, entry->start,
  611                                         entry->end, VM_PROT_EXECUTE) == TRUE) {
  612                                         pkm.pm_address = (uintptr_t) addr;
  613                                         pkm.pm_size = (size_t) size;
  614                                         break;
  615                                 }
  616                         }
  617                 }
  618         }
  619 #endif
  620         rv = vm_map_delete(map, addr, end);
  621 
  622 #ifdef HWPMC_HOOKS
  623         if (rv == KERN_SUCCESS && __predict_false(pmc_handled)) {
  624                 /* downgrade the lock to prevent a LOR with the pmc-sx lock */
  625                 vm_map_lock_downgrade(map);
  626                 if (pkm.pm_address != (uintptr_t) NULL)
  627                         PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
  628                 vm_map_unlock_read(map);
  629         } else
  630 #endif
  631                 vm_map_unlock(map);
  632 
  633         return (vm_mmap_to_errno(rv));
  634 }
  635 
  636 #ifndef _SYS_SYSPROTO_H_
  637 struct mprotect_args {
  638         const void *addr;
  639         size_t len;
  640         int prot;
  641 };
  642 #endif
  643 int
  644 sys_mprotect(struct thread *td, struct mprotect_args *uap)
  645 {
  646 
  647         return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot));
  648 }
  649 
  650 int
  651 kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot)
  652 {
  653         vm_offset_t addr;
  654         vm_size_t pageoff;
  655         int vm_error, max_prot;
  656         int flags;
  657 
  658         addr = addr0;
  659         if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0)
  660                 return (EINVAL);
  661         max_prot = PROT_MAX_EXTRACT(prot);
  662         prot = PROT_EXTRACT(prot);
  663         pageoff = (addr & PAGE_MASK);
  664         addr -= pageoff;
  665         size += pageoff;
  666         size = (vm_size_t) round_page(size);
  667 #ifdef COMPAT_FREEBSD32
  668         if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
  669                 if (((addr + size) & 0xffffffff) < addr)
  670                         return (EINVAL);
  671         } else
  672 #endif
  673         if (addr + size < addr)
  674                 return (EINVAL);
  675 
  676         flags = VM_MAP_PROTECT_SET_PROT;
  677         if (max_prot != 0)
  678                 flags |= VM_MAP_PROTECT_SET_MAXPROT;
  679         vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map,
  680             addr, addr + size, prot, max_prot, flags);
  681 
  682         switch (vm_error) {
  683         case KERN_SUCCESS:
  684                 return (0);
  685         case KERN_PROTECTION_FAILURE:
  686                 return (EACCES);
  687         case KERN_RESOURCE_SHORTAGE:
  688                 return (ENOMEM);
  689         case KERN_OUT_OF_BOUNDS:
  690                 return (ENOTSUP);
  691         }
  692         return (EINVAL);
  693 }
  694 
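/*
 * Editorial sketch, not part of the original source: like mmap(),
 * mprotect() accepts PROT_MAX() bits, which kern_mprotect() above splits
 * off and applies with VM_MAP_PROTECT_SET_MAXPROT.  For example,
 *
 *	mprotect(addr, len, PROT_READ | PROT_MAX(PROT_READ));
 *
 * drops the current protection to read-only and also lowers the maximum
 * protection so the range can no longer be made writable or executable.
 */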
  695 #ifndef _SYS_SYSPROTO_H_
  696 struct minherit_args {
  697         void *addr;
  698         size_t len;
  699         int inherit;
  700 };
  701 #endif
  702 int
  703 sys_minherit(struct thread *td, struct minherit_args *uap)
  704 {
  705 
  706         return (kern_minherit(td, (uintptr_t)uap->addr, uap->len,
  707             uap->inherit));
  708 }
  709 
  710 int
  711 kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0)
  712 {
  713         vm_offset_t addr;
  714         vm_size_t size, pageoff;
  715         vm_inherit_t inherit;
  716 
  717         addr = (vm_offset_t)addr0;
  718         size = len;
  719         inherit = inherit0;
  720 
  721         pageoff = (addr & PAGE_MASK);
  722         addr -= pageoff;
  723         size += pageoff;
  724         size = (vm_size_t) round_page(size);
  725         if (addr + size < addr)
  726                 return (EINVAL);
  727 
  728         switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
  729             addr + size, inherit)) {
  730         case KERN_SUCCESS:
  731                 return (0);
  732         case KERN_PROTECTION_FAILURE:
  733                 return (EACCES);
  734         }
  735         return (EINVAL);
  736 }
  737 
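/*
 * Editorial sketch, not part of the original source: minherit() controls
 * what a child receives across fork(), e.g.
 *
 *	minherit(addr, len, INHERIT_NONE);
 *
 * leaves the range unmapped in the child, while INHERIT_SHARE and
 * INHERIT_COPY select shared versus copy-on-write inheritance.
 * vm_map_inherit() validates the inherit value itself, which is why
 * kern_minherit() above only distinguishes success, EACCES and EINVAL.
 */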
  738 #ifndef _SYS_SYSPROTO_H_
  739 struct madvise_args {
  740         void *addr;
  741         size_t len;
  742         int behav;
  743 };
  744 #endif
  745 
  746 int
  747 sys_madvise(struct thread *td, struct madvise_args *uap)
  748 {
  749 
  750         return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav));
  751 }
  752 
  753 int
  754 kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav)
  755 {
  756         vm_map_t map;
  757         vm_offset_t addr, end, start;
  758         int flags;
  759 
  760         /*
  761          * Check for our special case, advising the swap pager we are
  762          * "immortal."
  763          */
  764         if (behav == MADV_PROTECT) {
  765                 flags = PPROT_SET;
  766                 return (kern_procctl(td, P_PID, td->td_proc->p_pid,
  767                     PROC_SPROTECT, &flags));
  768         }
  769 
  770         /*
  771          * Check for illegal addresses.  Watch out for address wrap... Note
  772          * that VM_*_ADDRESS are not constants due to casts (argh).
  773          */
  774         map = &td->td_proc->p_vmspace->vm_map;
  775         addr = addr0;
  776         if (!vm_map_range_valid(map, addr, addr + len))
  777                 return (EINVAL);
  778 
  779         /*
  780          * Since this routine is only advisory, we default to conservative
  781          * behavior.
  782          */
  783         start = trunc_page(addr);
  784         end = round_page(addr + len);
  785 
  786         /*
  787          * vm_map_madvise() checks for illegal values of behav.
  788          */
  789         return (vm_map_madvise(map, start, end, behav));
  790 }
  791 
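/*
 * Editorial sketch, not part of the original source: the MADV_PROTECT
 * case above never touches the address range (the check precedes the
 * range validation, so addr and len are ignored); it is the interface by
 * which a privileged, long-lived process asks not to be killed when the
 * pageout daemon runs out of memory, roughly
 *
 *	madvise(NULL, 0, MADV_PROTECT);
 *
 * which kern_madvise() turns into kern_procctl(PROC_SPROTECT) with
 * PPROT_SET.  Every other behav value is range advice handed to
 * vm_map_madvise().
 */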
  792 #ifndef _SYS_SYSPROTO_H_
  793 struct mincore_args {
  794         const void *addr;
  795         size_t len;
  796         char *vec;
  797 };
  798 #endif
  799 
  800 int
  801 sys_mincore(struct thread *td, struct mincore_args *uap)
  802 {
  803 
  804         return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec));
  805 }
  806 
  807 int
  808 kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec)
  809 {
  810         pmap_t pmap;
  811         vm_map_t map;
  812         vm_map_entry_t current, entry;
  813         vm_object_t object;
  814         vm_offset_t addr, cend, end, first_addr;
  815         vm_paddr_t pa;
  816         vm_page_t m;
  817         vm_pindex_t pindex;
  818         int error, lastvecindex, mincoreinfo, vecindex;
  819         unsigned int timestamp;
  820 
  821         /*
  822          * Make sure that the addresses presented are valid for user
  823          * mode.
  824          */
  825         first_addr = addr = trunc_page(addr0);
  826         end = round_page(addr0 + len);
  827         map = &td->td_proc->p_vmspace->vm_map;
  828         if (end > vm_map_max(map) || end < addr)
  829                 return (ENOMEM);
  830 
  831         pmap = vmspace_pmap(td->td_proc->p_vmspace);
  832 
  833         vm_map_lock_read(map);
  834 RestartScan:
  835         timestamp = map->timestamp;
  836 
  837         if (!vm_map_lookup_entry(map, addr, &entry)) {
  838                 vm_map_unlock_read(map);
  839                 return (ENOMEM);
  840         }
  841 
  842         /*
  843          * Do this on a map entry basis so that if the pages are not
   844  * in the current process's address space, we can easily look
  845          * up the pages elsewhere.
  846          */
  847         lastvecindex = -1;
  848         while (entry->start < end) {
  849                 /*
  850                  * check for contiguity
  851                  */
  852                 current = entry;
  853                 entry = vm_map_entry_succ(current);
  854                 if (current->end < end &&
  855                     entry->start > current->end) {
  856                         vm_map_unlock_read(map);
  857                         return (ENOMEM);
  858                 }
  859 
  860                 /*
  861                  * ignore submaps (for now) or null objects
  862                  */
  863                 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
  864                     current->object.vm_object == NULL)
  865                         continue;
  866 
  867                 /*
  868                  * limit this scan to the current map entry and the
  869                  * limits for the mincore call
  870                  */
  871                 if (addr < current->start)
  872                         addr = current->start;
  873                 cend = current->end;
  874                 if (cend > end)
  875                         cend = end;
  876 
  877                 for (; addr < cend; addr += PAGE_SIZE) {
  878                         /*
  879                          * Check pmap first, it is likely faster, also
  880                          * it can provide info as to whether we are the
  881                          * one referencing or modifying the page.
  882                          */
  883                         m = NULL;
  884                         object = NULL;
  885 retry:
  886                         pa = 0;
  887                         mincoreinfo = pmap_mincore(pmap, addr, &pa);
  888                         if (mincore_mapped) {
  889                                 /*
  890                                  * We only care about this pmap's
  891                                  * mapping of the page, if any.
  892                                  */
  893                                 ;
  894                         } else if (pa != 0) {
  895                                 /*
  896                                  * The page is mapped by this process but not
  897                                  * both accessed and modified.  It is also
  898                                  * managed.  Acquire the object lock so that
  899                                  * other mappings might be examined.  The page's
  900                                  * identity may change at any point before its
  901                                  * object lock is acquired, so re-validate if
  902                                  * necessary.
  903                                  */
  904                                 m = PHYS_TO_VM_PAGE(pa);
  905                                 while (object == NULL || m->object != object) {
  906                                         if (object != NULL)
  907                                                 VM_OBJECT_WUNLOCK(object);
  908                                         object = atomic_load_ptr(&m->object);
  909                                         if (object == NULL)
  910                                                 goto retry;
  911                                         VM_OBJECT_WLOCK(object);
  912                                 }
  913                                 if (pa != pmap_extract(pmap, addr))
  914                                         goto retry;
  915                                 KASSERT(vm_page_all_valid(m),
  916                                     ("mincore: page %p is mapped but invalid",
  917                                     m));
  918                         } else if (mincoreinfo == 0) {
  919                                 /*
  920                                  * The page is not mapped by this process.  If
  921                                  * the object implements managed pages, then
  922                                  * determine if the page is resident so that
  923                                  * the mappings might be examined.
  924                                  */
  925                                 if (current->object.vm_object != object) {
  926                                         if (object != NULL)
  927                                                 VM_OBJECT_WUNLOCK(object);
  928                                         object = current->object.vm_object;
  929                                         VM_OBJECT_WLOCK(object);
  930                                 }
  931                                 if (object->type == OBJT_DEFAULT ||
  932                                     object->type == OBJT_SWAP ||
  933                                     object->type == OBJT_VNODE) {
  934                                         pindex = OFF_TO_IDX(current->offset +
  935                                             (addr - current->start));
  936                                         m = vm_page_lookup(object, pindex);
  937                                         if (m != NULL && vm_page_none_valid(m))
  938                                                 m = NULL;
  939                                         if (m != NULL)
  940                                                 mincoreinfo = MINCORE_INCORE;
  941                                 }
  942                         }
  943                         if (m != NULL) {
  944                                 VM_OBJECT_ASSERT_WLOCKED(m->object);
  945 
  946                                 /* Examine other mappings of the page. */
  947                                 if (m->dirty == 0 && pmap_is_modified(m))
  948                                         vm_page_dirty(m);
  949                                 if (m->dirty != 0)
  950                                         mincoreinfo |= MINCORE_MODIFIED_OTHER;
  951 
  952                                 /*
  953                                  * The first test for PGA_REFERENCED is an
  954                                  * optimization.  The second test is
  955                                  * required because a concurrent pmap
  956                                  * operation could clear the last reference
  957                                  * and set PGA_REFERENCED before the call to
  958                                  * pmap_is_referenced(). 
  959                                  */
  960                                 if ((m->a.flags & PGA_REFERENCED) != 0 ||
  961                                     pmap_is_referenced(m) ||
  962                                     (m->a.flags & PGA_REFERENCED) != 0)
  963                                         mincoreinfo |= MINCORE_REFERENCED_OTHER;
  964                         }
  965                         if (object != NULL)
  966                                 VM_OBJECT_WUNLOCK(object);
  967 
  968                         /*
  969                          * subyte may page fault.  In case it needs to modify
  970                          * the map, we release the lock.
  971                          */
  972                         vm_map_unlock_read(map);
  973 
  974                         /*
  975                          * calculate index into user supplied byte vector
  976                          */
  977                         vecindex = atop(addr - first_addr);
  978 
  979                         /*
  980                          * If we have skipped map entries, we need to make sure that
  981                          * the byte vector is zeroed for those skipped entries.
  982                          */
  983                         while ((lastvecindex + 1) < vecindex) {
  984                                 ++lastvecindex;
  985                                 error = subyte(vec + lastvecindex, 0);
  986                                 if (error) {
  987                                         error = EFAULT;
  988                                         goto done2;
  989                                 }
  990                         }
  991 
  992                         /*
  993                          * Pass the page information to the user
  994                          */
  995                         error = subyte(vec + vecindex, mincoreinfo);
  996                         if (error) {
  997                                 error = EFAULT;
  998                                 goto done2;
  999                         }
 1000 
 1001                         /*
 1002                          * If the map has changed, due to the subyte, the previous
 1003                          * output may be invalid.
 1004                          */
 1005                         vm_map_lock_read(map);
 1006                         if (timestamp != map->timestamp)
 1007                                 goto RestartScan;
 1008 
 1009                         lastvecindex = vecindex;
 1010                 }
 1011         }
 1012 
 1013         /*
 1014          * subyte may page fault.  In case it needs to modify
 1015          * the map, we release the lock.
 1016          */
 1017         vm_map_unlock_read(map);
 1018 
 1019         /*
 1020          * Zero the last entries in the byte vector.
 1021          */
 1022         vecindex = atop(end - first_addr);
 1023         while ((lastvecindex + 1) < vecindex) {
 1024                 ++lastvecindex;
 1025                 error = subyte(vec + lastvecindex, 0);
 1026                 if (error) {
 1027                         error = EFAULT;
 1028                         goto done2;
 1029                 }
 1030         }
 1031 
 1032         /*
 1033          * If the map has changed, due to the subyte, the previous
 1034          * output may be invalid.
 1035          */
 1036         vm_map_lock_read(map);
 1037         if (timestamp != map->timestamp)
 1038                 goto RestartScan;
 1039         vm_map_unlock_read(map);
 1040 done2:
 1041         return (error);
 1042 }
 1043 
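/*
 * Editorial sketch, not part of the original source: mincore() fills one
 * status byte per page of the requested range, e.g.
 *
 *	char vec[npages];
 *	mincore(addr, npages * getpagesize(), vec);
 *
 * Each byte carries MINCORE_INCORE plus the MINCORE_REFERENCED /
 * MINCORE_MODIFIED (this pmap) and MINCORE_REFERENCED_OTHER /
 * MINCORE_MODIFIED_OTHER (any mapping) bits gathered above from
 * pmap_mincore() and the object scan; the vm.mincore_mapped sysctl
 * selects whether a resident but unmapped page is still reported.
 */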
 1044 #ifndef _SYS_SYSPROTO_H_
 1045 struct mlock_args {
 1046         const void *addr;
 1047         size_t len;
 1048 };
 1049 #endif
 1050 int
 1051 sys_mlock(struct thread *td, struct mlock_args *uap)
 1052 {
 1053 
 1054         return (kern_mlock(td->td_proc, td->td_ucred,
 1055             __DECONST(uintptr_t, uap->addr), uap->len));
 1056 }
 1057 
 1058 int
 1059 kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len)
 1060 {
 1061         vm_offset_t addr, end, last, start;
 1062         vm_size_t npages, size;
 1063         vm_map_t map;
 1064         unsigned long nsize;
 1065         int error;
 1066 
 1067         error = priv_check_cred(cred, PRIV_VM_MLOCK);
 1068         if (error)
 1069                 return (error);
 1070         addr = addr0;
 1071         size = len;
 1072         last = addr + size;
 1073         start = trunc_page(addr);
 1074         end = round_page(last);
 1075         if (last < addr || end < addr)
 1076                 return (EINVAL);
 1077         npages = atop(end - start);
 1078         if (npages > vm_page_max_user_wired)
 1079                 return (ENOMEM);
 1080         map = &proc->p_vmspace->vm_map;
 1081         PROC_LOCK(proc);
 1082         nsize = ptoa(npages + pmap_wired_count(map->pmap));
 1083         if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) {
 1084                 PROC_UNLOCK(proc);
 1085                 return (ENOMEM);
 1086         }
 1087         PROC_UNLOCK(proc);
 1088 #ifdef RACCT
 1089         if (racct_enable) {
 1090                 PROC_LOCK(proc);
 1091                 error = racct_set(proc, RACCT_MEMLOCK, nsize);
 1092                 PROC_UNLOCK(proc);
 1093                 if (error != 0)
 1094                         return (ENOMEM);
 1095         }
 1096 #endif
 1097         error = vm_map_wire(map, start, end,
 1098             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1099 #ifdef RACCT
 1100         if (racct_enable && error != KERN_SUCCESS) {
 1101                 PROC_LOCK(proc);
 1102                 racct_set(proc, RACCT_MEMLOCK,
 1103                     ptoa(pmap_wired_count(map->pmap)));
 1104                 PROC_UNLOCK(proc);
 1105         }
 1106 #endif
 1107         switch (error) {
 1108         case KERN_SUCCESS:
 1109                 return (0);
 1110         case KERN_INVALID_ARGUMENT:
 1111                 return (EINVAL);
 1112         default:
 1113                 return (ENOMEM);
 1114         }
 1115 }
 1116 
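/*
 * Editorial sketch, not part of the original source: mlock() operates on
 * whole pages, so a call such as
 *
 *	mlock(addr, len);
 *
 * wires the range [trunc_page(addr), round_page(addr + len)).  The
 * request is checked against vm_page_max_user_wired, RLIMIT_MEMLOCK and,
 * when RACCT is enabled, RACCT_MEMLOCK; exceeding any of these returns
 * ENOMEM rather than wiring the range partially.
 */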
 1117 #ifndef _SYS_SYSPROTO_H_
 1118 struct mlockall_args {
 1119         int     how;
 1120 };
 1121 #endif
 1122 
 1123 int
 1124 sys_mlockall(struct thread *td, struct mlockall_args *uap)
 1125 {
 1126         vm_map_t map;
 1127         int error;
 1128 
 1129         map = &td->td_proc->p_vmspace->vm_map;
 1130         error = priv_check(td, PRIV_VM_MLOCK);
 1131         if (error)
 1132                 return (error);
 1133 
 1134         if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 1135                 return (EINVAL);
 1136 
 1137         /*
 1138          * If wiring all pages in the process would cause it to exceed
 1139          * a hard resource limit, return ENOMEM.
 1140          */
 1141         if (!old_mlock && uap->how & MCL_CURRENT) {
 1142                 if (map->size > lim_cur(td, RLIMIT_MEMLOCK))
 1143                         return (ENOMEM);
 1144         }
 1145 #ifdef RACCT
 1146         if (racct_enable) {
 1147                 PROC_LOCK(td->td_proc);
 1148                 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
 1149                 PROC_UNLOCK(td->td_proc);
 1150                 if (error != 0)
 1151                         return (ENOMEM);
 1152         }
 1153 #endif
 1154 
 1155         if (uap->how & MCL_FUTURE) {
 1156                 vm_map_lock(map);
 1157                 vm_map_modflags(map, MAP_WIREFUTURE, 0);
 1158                 vm_map_unlock(map);
 1159                 error = 0;
 1160         }
 1161 
 1162         if (uap->how & MCL_CURRENT) {
 1163                 /*
 1164                  * P1003.1-2001 mandates that all currently mapped pages
 1165                  * will be memory resident and locked (wired) upon return
 1166                  * from mlockall(). vm_map_wire() will wire pages, by
 1167                  * calling vm_fault_wire() for each page in the region.
 1168                  */
 1169                 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 1170                     VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1171                 if (error == KERN_SUCCESS)
 1172                         error = 0;
 1173                 else if (error == KERN_RESOURCE_SHORTAGE)
 1174                         error = ENOMEM;
 1175                 else
 1176                         error = EAGAIN;
 1177         }
 1178 #ifdef RACCT
 1179         if (racct_enable && error != KERN_SUCCESS) {
 1180                 PROC_LOCK(td->td_proc);
 1181                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1182                     ptoa(pmap_wired_count(map->pmap)));
 1183                 PROC_UNLOCK(td->td_proc);
 1184         }
 1185 #endif
 1186 
 1187         return (error);
 1188 }
 1189 
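/*
 * Editorial sketch, not part of the original source: a real-time style
 * process typically requests
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *
 * MCL_FUTURE merely sets MAP_WIREFUTURE on the vm_map so later mappings
 * are wired as they are created, while MCL_CURRENT wires everything
 * already mapped via vm_map_wire().  The RLIMIT_MEMLOCK check above is
 * skipped when the vm.old_mlock compatibility sysctl is set.
 */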
 1190 #ifndef _SYS_SYSPROTO_H_
 1191 struct munlockall_args {
 1192         register_t dummy;
 1193 };
 1194 #endif
 1195 
 1196 int
 1197 sys_munlockall(struct thread *td, struct munlockall_args *uap)
 1198 {
 1199         vm_map_t map;
 1200         int error;
 1201 
 1202         map = &td->td_proc->p_vmspace->vm_map;
 1203         error = priv_check(td, PRIV_VM_MUNLOCK);
 1204         if (error)
 1205                 return (error);
 1206 
 1207         /* Clear the MAP_WIREFUTURE flag from this vm_map. */
 1208         vm_map_lock(map);
 1209         vm_map_modflags(map, 0, MAP_WIREFUTURE);
 1210         vm_map_unlock(map);
 1211 
 1212         /* Forcibly unwire all pages. */
 1213         error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 1214             VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1215 #ifdef RACCT
 1216         if (racct_enable && error == KERN_SUCCESS) {
 1217                 PROC_LOCK(td->td_proc);
 1218                 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 1219                 PROC_UNLOCK(td->td_proc);
 1220         }
 1221 #endif
 1222 
 1223         return (error);
 1224 }
 1225 
 1226 #ifndef _SYS_SYSPROTO_H_
 1227 struct munlock_args {
 1228         const void *addr;
 1229         size_t len;
 1230 };
 1231 #endif
 1232 int
 1233 sys_munlock(struct thread *td, struct munlock_args *uap)
 1234 {
 1235 
 1236         return (kern_munlock(td, (uintptr_t)uap->addr, uap->len));
 1237 }
 1238 
 1239 int
 1240 kern_munlock(struct thread *td, uintptr_t addr0, size_t size)
 1241 {
 1242         vm_offset_t addr, end, last, start;
 1243 #ifdef RACCT
 1244         vm_map_t map;
 1245 #endif
 1246         int error;
 1247 
 1248         error = priv_check(td, PRIV_VM_MUNLOCK);
 1249         if (error)
 1250                 return (error);
 1251         addr = addr0;
 1252         last = addr + size;
 1253         start = trunc_page(addr);
 1254         end = round_page(last);
 1255         if (last < addr || end < addr)
 1256                 return (EINVAL);
 1257         error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 1258             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1259 #ifdef RACCT
 1260         if (racct_enable && error == KERN_SUCCESS) {
 1261                 PROC_LOCK(td->td_proc);
 1262                 map = &td->td_proc->p_vmspace->vm_map;
 1263                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1264                     ptoa(pmap_wired_count(map->pmap)));
 1265                 PROC_UNLOCK(td->td_proc);
 1266         }
 1267 #endif
 1268         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1269 }
 1270 
 1271 /*
 1272  * vm_mmap_vnode()
 1273  *
 1274  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1275  * operations on vnodes.
 1276  */
 1277 int
 1278 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 1279     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1280     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
 1281     boolean_t *writecounted)
 1282 {
 1283         struct vattr va;
 1284         vm_object_t obj;
 1285         vm_ooffset_t foff;
 1286         struct ucred *cred;
 1287         int error, flags;
 1288         bool writex;
 1289 
 1290         cred = td->td_ucred;
 1291         writex = (*maxprotp & VM_PROT_WRITE) != 0 &&
 1292             (*flagsp & MAP_SHARED) != 0;
 1293         if ((error = vget(vp, LK_SHARED)) != 0)
 1294                 return (error);
 1295         AUDIT_ARG_VNODE1(vp);
 1296         foff = *foffp;
 1297         flags = *flagsp;
 1298         obj = vp->v_object;
 1299         if (vp->v_type == VREG) {
 1300                 /*
 1301                  * Get the proper underlying object
 1302                  */
 1303                 if (obj == NULL) {
 1304                         error = EINVAL;
 1305                         goto done;
 1306                 }
 1307                 if (obj->type == OBJT_VNODE && obj->handle != vp) {
 1308                         vput(vp);
 1309                         vp = (struct vnode *)obj->handle;
 1310                         /*
 1311                          * Bypass filesystems obey the mpsafety of the
 1312                          * underlying fs.  Tmpfs never bypasses.
 1313                          */
 1314                         error = vget(vp, LK_SHARED);
 1315                         if (error != 0)
 1316                                 return (error);
 1317                 }
 1318                 if (writex) {
 1319                         *writecounted = TRUE;
 1320                         vm_pager_update_writecount(obj, 0, objsize);
 1321                 }
 1322         } else {
 1323                 error = EINVAL;
 1324                 goto done;
 1325         }
 1326         if ((error = VOP_GETATTR(vp, &va, cred)))
 1327                 goto done;
 1328 #ifdef MAC
 1329         /* This relies on VM_PROT_* matching PROT_*. */
 1330         error = mac_vnode_check_mmap(cred, vp, (int)prot, flags);
 1331         if (error != 0)
 1332                 goto done;
 1333 #endif
 1334         if ((flags & MAP_SHARED) != 0) {
 1335                 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
 1336                         if (prot & VM_PROT_WRITE) {
 1337                                 error = EPERM;
 1338                                 goto done;
 1339                         }
 1340                         *maxprotp &= ~VM_PROT_WRITE;
 1341                 }
 1342         }
 1343         /*
  1344          * If it is a regular file without any references,
  1345          * we do not need to sync it.
  1346          * Adjust the object size to be the size of the actual file.
 1347          */
 1348         objsize = round_page(va.va_size);
 1349         if (va.va_nlink == 0)
 1350                 flags |= MAP_NOSYNC;
 1351         if (obj->type == OBJT_VNODE) {
 1352                 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 1353                     cred);
 1354                 if (obj == NULL) {
 1355                         error = ENOMEM;
 1356                         goto done;
 1357                 }
 1358         } else {
 1359                 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
 1360                     ("wrong object type"));
 1361                 vm_object_reference(obj);
 1362 #if VM_NRESERVLEVEL > 0
 1363                 if ((obj->flags & OBJ_COLORED) == 0) {
 1364                         VM_OBJECT_WLOCK(obj);
 1365                         vm_object_color(obj, 0);
 1366                         VM_OBJECT_WUNLOCK(obj);
 1367                 }
 1368 #endif
 1369         }
 1370         *objp = obj;
 1371         *flagsp = flags;
 1372 
 1373         VOP_MMAPPED(vp);
 1374 
 1375 done:
 1376         if (error != 0 && *writecounted) {
 1377                 *writecounted = FALSE;
 1378                 vm_pager_update_writecount(obj, objsize, 0);
 1379         }
 1380         vput(vp);
 1381         return (error);
 1382 }
 1383 
 1384 /*
 1385  * vm_mmap_cdev()
 1386  *
 1387  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1388  * operations on cdevs.
 1389  */
 1390 int
 1391 vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot,
 1392     vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw,
 1393     vm_ooffset_t *foff, vm_object_t *objp)
 1394 {
 1395         vm_object_t obj;
 1396         int error, flags;
 1397 
 1398         flags = *flagsp;
 1399 
 1400         if (dsw->d_flags & D_MMAP_ANON) {
 1401                 *objp = NULL;
 1402                 *foff = 0;
 1403                 *maxprotp = VM_PROT_ALL;
 1404                 *flagsp |= MAP_ANON;
 1405                 return (0);
 1406         }
 1407         /*
 1408          * cdevs do not provide private mappings of any kind.
 1409          */
 1410         if ((*maxprotp & VM_PROT_WRITE) == 0 &&
 1411             (prot & VM_PROT_WRITE) != 0)
 1412                 return (EACCES);
 1413         if (flags & (MAP_PRIVATE|MAP_COPY))
 1414                 return (EINVAL);
 1415         /*
 1416          * Force device mappings to be shared.
 1417          */
 1418         flags |= MAP_SHARED;
 1419 #ifdef MAC_XXX
 1420         error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot);
 1421         if (error != 0)
 1422                 return (error);
 1423 #endif
 1424         /*
 1425          * First, try d_mmap_single().  If that is not implemented
 1426          * (returns ENODEV), fall back to using the device pager.
 1427          * Note that d_mmap_single() must return a reference to the
 1428          * object (it needs to bump the reference count of the object
 1429          * it returns somehow).
 1430          *
 1431          * XXX assumes VM_PROT_* == PROT_*
 1432          */
 1433         error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
 1434         if (error != ENODEV)
 1435                 return (error);
 1436         obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
 1437             td->td_ucred);
 1438         if (obj == NULL)
 1439                 return (EINVAL);
 1440         *objp = obj;
 1441         *flagsp = flags;
 1442         return (0);
 1443 }
 1444 
 1445 /*
 1446  * vm_mmap()
 1447  *
 1448  * Internal version of mmap used by exec, sys5 shared memory, and
 1449  * various device drivers.  Handle is either a vnode pointer, a
 1450  * character device, or NULL for MAP_ANON.
 1451  */
 1452 int
 1453 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1454         vm_prot_t maxprot, int flags,
 1455         objtype_t handle_type, void *handle,
 1456         vm_ooffset_t foff)
 1457 {
 1458         vm_object_t object;
 1459         struct thread *td = curthread;
 1460         int error;
 1461         boolean_t writecounted;
 1462 
 1463         if (size == 0)
 1464                 return (EINVAL);
 1465 
 1466         size = round_page(size);
 1467         object = NULL;
 1468         writecounted = FALSE;
 1469 
 1470         /*
 1471          * Lookup/allocate object.
 1472          */
 1473         switch (handle_type) {
 1474         case OBJT_DEVICE: {
 1475                 struct cdevsw *dsw;
 1476                 struct cdev *cdev;
 1477                 int ref;
 1478 
 1479                 cdev = handle;
 1480                 dsw = dev_refthread(cdev, &ref);
 1481                 if (dsw == NULL)
 1482                         return (ENXIO);
 1483                 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev,
 1484                     dsw, &foff, &object);
 1485                 dev_relthread(cdev, ref);
 1486                 break;
 1487         }
 1488         case OBJT_VNODE:
 1489                 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
 1490                     handle, &foff, &object, &writecounted);
 1491                 break;
 1492         case OBJT_DEFAULT:
 1493                 if (handle == NULL) {
 1494                         error = 0;
 1495                         break;
 1496                 }
 1497                 /* FALLTHROUGH */
 1498         default:
 1499                 error = EINVAL;
 1500                 break;
 1501         }
 1502         if (error)
 1503                 return (error);
 1504 
 1505         error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 1506             foff, writecounted, td);
 1507         if (error != 0 && object != NULL) {
 1508                 /*
 1509                  * If this mapping was accounted for in the vnode's
 1510                  * writecount, then undo that now.
 1511                  */
 1512                 if (writecounted)
 1513                         vm_pager_release_writecount(object, 0, size);
 1514                 vm_object_deallocate(object);
 1515         }
 1516         return (error);
 1517 }
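
To make the calling convention concrete, a subsystem holding a cdev reference could map the device into the current process roughly as follows (example_map_device is a hypothetical wrapper written for this sketch, not an existing KPI):

/*
 * Hypothetical caller of vm_mmap(): map len bytes of a character
 * device into the current process at a kernel-chosen address.
 */
static int
example_map_device(struct cdev *cdev, vm_size_t len, vm_offset_t *addrp)
{
        struct thread *td = curthread;
        vm_map_t map = &td->td_proc->p_vmspace->vm_map;

        *addrp = 0;             /* no MAP_FIXED: let the kernel pick a spot */
        return (vm_mmap(map, addrp, len, VM_PROT_READ | VM_PROT_WRITE,
            VM_PROT_ALL, MAP_SHARED, OBJT_DEVICE, cdev, 0));
}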
 1518 
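/*
 * kern_mmap_racct_check()
 *
 * Verify that the resource limits and racct accounting of the calling
 * thread's process allow the map to grow by the given size (and, when
 * MAP_WIREFUTURE is set, allow the additional wired memory).  Returns
 * 0 on success or an error (typically ENOMEM) if a limit would be
 * exceeded.
 */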
 1519 int
 1520 kern_mmap_racct_check(struct thread *td, vm_map_t map, vm_size_t size)
 1521 {
 1522         int error;
 1523 
 1524         RACCT_PROC_LOCK(td->td_proc);
 1525         if (map->size + size > lim_cur(td, RLIMIT_VMEM)) {
 1526                 RACCT_PROC_UNLOCK(td->td_proc);
 1527                 return (ENOMEM);
 1528         }
 1529         if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
 1530                 RACCT_PROC_UNLOCK(td->td_proc);
 1531                 return (ENOMEM);
 1532         }
 1533         if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 1534                 if (ptoa(pmap_wired_count(map->pmap)) + size >
 1535                     lim_cur(td, RLIMIT_MEMLOCK)) {
 1536                         racct_set_force(td->td_proc, RACCT_VMEM, map->size);
 1537                         RACCT_PROC_UNLOCK(td->td_proc);
 1538                         return (ENOMEM);
 1539                 }
 1540                 error = racct_set(td->td_proc, RACCT_MEMLOCK,
 1541                     ptoa(pmap_wired_count(map->pmap)) + size);
 1542                 if (error != 0) {
 1543                         racct_set_force(td->td_proc, RACCT_VMEM, map->size);
 1544                         RACCT_PROC_UNLOCK(td->td_proc);
 1545                         return (error);
 1546                 }
 1547         }
 1548         RACCT_PROC_UNLOCK(td->td_proc);
 1549         return (0);
 1550 }
 1551 
 1552 /*
 1553  * Internal version of mmap that maps a specific VM object into a
 1554  * map.  Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap.
 1555  */
 1556 int
 1557 vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1558     vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff,
 1559     boolean_t writecounted, struct thread *td)
 1560 {
 1561         vm_offset_t max_addr;
 1562         int docow, error, findspace, rv;
 1563         bool curmap, fitit;
 1564 
 1565         curmap = map == &td->td_proc->p_vmspace->vm_map;
 1566         if (curmap) {
 1567                 error = kern_mmap_racct_check(td, map, size);
 1568                 if (error != 0)
 1569                         return (error);
 1570         }
 1571 
 1572         /*
 1573          * We currently can only deal with page aligned file offsets.
 1574          * The mmap() system call already enforces this by subtracting
 1575          * the page offset from the file offset, but checking here
 1576  * catches errors in device drivers (e.g. d_mmap_single()
 1577          * callbacks) and other internal mapping requests (such as in
 1578          * exec).
 1579          */
 1580         if (foff & PAGE_MASK)
 1581                 return (EINVAL);
 1582 
 1583         if ((flags & MAP_FIXED) == 0) {
 1584                 fitit = TRUE;
 1585                 *addr = round_page(*addr);
 1586         } else {
 1587                 if (*addr != trunc_page(*addr))
 1588                         return (EINVAL);
 1589                 fitit = FALSE;
 1590         }
 1591 
 1592         if (flags & MAP_ANON) {
 1593                 if (object != NULL || foff != 0)
 1594                         return (EINVAL);
 1595                 docow = 0;
 1596         } else if (flags & MAP_PREFAULT_READ)
 1597                 docow = MAP_PREFAULT;
 1598         else
 1599                 docow = MAP_PREFAULT_PARTIAL;
 1600 
 1601         if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 1602                 docow |= MAP_COPY_ON_WRITE;
 1603         if (flags & MAP_NOSYNC)
 1604                 docow |= MAP_DISABLE_SYNCER;
 1605         if (flags & MAP_NOCORE)
 1606                 docow |= MAP_DISABLE_COREDUMP;
 1607         /* Shared memory is also shared with children. */
 1608         if (flags & MAP_SHARED)
 1609                 docow |= MAP_INHERIT_SHARE;
 1610         if (writecounted)
 1611                 docow |= MAP_WRITECOUNT;
 1612         if (flags & MAP_STACK) {
 1613                 if (object != NULL)
 1614                         return (EINVAL);
 1615                 docow |= MAP_STACK_GROWS_DOWN;
 1616         }
 1617         if ((flags & MAP_EXCL) != 0)
 1618                 docow |= MAP_CHECK_EXCL;
 1619         if ((flags & MAP_GUARD) != 0)
 1620                 docow |= MAP_CREATE_GUARD;
 1621 
 1622         if (fitit) {
 1623                 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
 1624                         findspace = VMFS_SUPER_SPACE;
 1625                 else if ((flags & MAP_ALIGNMENT_MASK) != 0)
 1626                         findspace = VMFS_ALIGNED_SPACE(flags >>
 1627                             MAP_ALIGNMENT_SHIFT);
 1628                 else
 1629                         findspace = VMFS_OPTIMAL_SPACE;
 1630                 max_addr = 0;
 1631 #ifdef MAP_32BIT
 1632                 if ((flags & MAP_32BIT) != 0)
 1633                         max_addr = MAP_32BIT_MAX_ADDR;
 1634 #endif
 1635                 if (curmap) {
 1636                         rv = vm_map_find_min(map, object, foff, addr, size,
 1637                             round_page((vm_offset_t)td->td_proc->p_vmspace->
 1638                             vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr,
 1639                             findspace, prot, maxprot, docow);
 1640                 } else {
 1641                         rv = vm_map_find(map, object, foff, addr, size,
 1642                             max_addr, findspace, prot, maxprot, docow);
 1643                 }
 1644         } else {
 1645                 rv = vm_map_fixed(map, object, foff, *addr, size,
 1646                     prot, maxprot, docow);
 1647         }
 1648 
 1649         if (rv == KERN_SUCCESS) {
 1650                 /*
 1651                  * If the process has requested that all future mappings
 1652                  * be wired, then heed this.
 1653                  */
 1654                 if ((map->flags & MAP_WIREFUTURE) != 0) {
 1655                         vm_map_lock(map);
 1656                         if ((map->flags & MAP_WIREFUTURE) != 0)
 1657                                 (void)vm_map_wire_locked(map, *addr,
 1658                                     *addr + size, VM_MAP_WIRE_USER |
 1659                                     ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK :
 1660                                     VM_MAP_WIRE_NOHOLES));
 1661                         vm_map_unlock(map);
 1662                 }
 1663         }
 1664         return (vm_mmap_to_errno(rv));
 1665 }
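
For instance, the MAP_ANON case mentioned in the header comment passes a NULL object and a zero offset; the hypothetical helper below (example_map_anon is invented for this sketch) shows a minimal anonymous mapping:

/*
 * Hypothetical caller of vm_mmap_object(): create an anonymous,
 * readable/writable mapping of len bytes in the current process.
 */
static int
example_map_anon(struct thread *td, vm_size_t len, vm_offset_t *addrp)
{
        vm_map_t map = &td->td_proc->p_vmspace->vm_map;

        *addrp = 0;             /* no MAP_FIXED: search for free space */
        return (vm_mmap_object(map, addrp, round_page(len),
            VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_ANON, NULL, 0,
            FALSE, td));
}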
 1666 
 1667 /*
 1668  * Translate a Mach VM return code to zero on success or the appropriate errno
 1669  * on failure.
 1670  */
 1671 int
 1672 vm_mmap_to_errno(int rv)
 1673 {
 1674 
 1675         switch (rv) {
 1676         case KERN_SUCCESS:
 1677                 return (0);
 1678         case KERN_INVALID_ADDRESS:
 1679         case KERN_NO_SPACE:
 1680                 return (ENOMEM);
 1681         case KERN_PROTECTION_FAILURE:
 1682                 return (EACCES);
 1683         default:
 1684                 return (EINVAL);
 1685         }
 1686 }
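
A typical use, sketched here rather than quoted from any particular caller, funnels a Mach-style vm_map return value through this helper before returning to user space:

        /* Hypothetical caller: tear down a mapping and report an errno. */
        rv = vm_map_remove(map, addr, addr + size);
        return (vm_mmap_to_errno(rv));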
