FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c


    1 /*-
    2  * Copyright (c) 1988 University of Utah.
    3  * Copyright (c) 1991, 1993
    4  *      The Regents of the University of California.  All rights reserved.
    5  *
    6  * This code is derived from software contributed to Berkeley by
    7  * the Systems Programming Group of the University of Utah Computer
    8  * Science Department.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   35  *
   36  *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
   37  */
   38 
   39 /*
   40  * Mapped file (mmap) interface to VM
   41  */
   42 
   43 #include <sys/cdefs.h>
   44 __FBSDID("$FreeBSD$");
   45 
   46 #include "opt_compat.h"
   47 #include "opt_hwpmc_hooks.h"
   48 
   49 #include <sys/param.h>
   50 #include <sys/systm.h>
   51 #include <sys/capsicum.h>
   52 #include <sys/kernel.h>
   53 #include <sys/lock.h>
   54 #include <sys/mutex.h>
   55 #include <sys/sysproto.h>
   56 #include <sys/filedesc.h>
   57 #include <sys/priv.h>
   58 #include <sys/proc.h>
   59 #include <sys/procctl.h>
   60 #include <sys/racct.h>
   61 #include <sys/resource.h>
   62 #include <sys/resourcevar.h>
   63 #include <sys/rwlock.h>
   64 #include <sys/sysctl.h>
   65 #include <sys/vnode.h>
   66 #include <sys/fcntl.h>
   67 #include <sys/file.h>
   68 #include <sys/mman.h>
   69 #include <sys/mount.h>
   70 #include <sys/conf.h>
   71 #include <sys/stat.h>
   72 #include <sys/syscallsubr.h>
   73 #include <sys/sysent.h>
   74 #include <sys/vmmeter.h>
   75 
   76 #include <security/mac/mac_framework.h>
   77 
   78 #include <vm/vm.h>
   79 #include <vm/vm_param.h>
   80 #include <vm/pmap.h>
   81 #include <vm/vm_map.h>
   82 #include <vm/vm_object.h>
   83 #include <vm/vm_page.h>
   84 #include <vm/vm_pager.h>
   85 #include <vm/vm_pageout.h>
   86 #include <vm/vm_extern.h>
   87 #include <vm/vm_page.h>
   88 #include <vm/vnode_pager.h>
   89 
   90 #ifdef HWPMC_HOOKS
   91 #include <sys/pmckern.h>
   92 #endif
   93 
   94 int old_mlock = 0;
   95 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
   96     "Do not apply RLIMIT_MEMLOCK on mlockall");
   97 TUNABLE_INT("vm.old_mlock", &old_mlock);
   98 
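/*
 * Illustrative userland sketch (not part of this file): reading the
 * vm.old_mlock knob exported above with sysctlbyname(3).  Because of
 * CTLFLAG_TUN/TUNABLE_INT it can also be preset from loader.conf.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
print_old_mlock(void)
{
        int val;
        size_t len = sizeof(val);

        /* Query the current value of the vm.old_mlock sysctl. */
        if (sysctlbyname("vm.old_mlock", &val, &len, NULL, 0) == -1)
                return (-1);
        printf("vm.old_mlock = %d\n", val);
        return (0);
}
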
   99 #ifdef MAP_32BIT
  100 #define MAP_32BIT_MAX_ADDR      ((vm_offset_t)1 << 31)
  101 #endif
  102 
  103 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
  104     int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
  105 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
  106     int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
  107 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
  108     int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
  109 
  110 #ifndef _SYS_SYSPROTO_H_
  111 struct sbrk_args {
  112         int incr;
  113 };
  114 #endif
  115 
  116 int
  117 sys_sbrk(struct thread *td, struct sbrk_args *uap)
  118 {
  119         /* Not yet implemented */
  120         return (EOPNOTSUPP);
  121 }
  122 
  123 #ifndef _SYS_SYSPROTO_H_
  124 struct sstk_args {
  125         int incr;
  126 };
  127 #endif
  128 
  129 int
  130 sys_sstk(struct thread *td, struct sstk_args *uap)
  131 {
  132         /* Not yet implemented */
  133         return (EOPNOTSUPP);
  134 }
  135 
  136 #if defined(COMPAT_43)
  137 #ifndef _SYS_SYSPROTO_H_
  138 struct getpagesize_args {
  139         int dummy;
  140 };
  141 #endif
  142 
  143 int
  144 ogetpagesize(struct thread *td, struct getpagesize_args *uap)
  145 {
  146 
  147         td->td_retval[0] = PAGE_SIZE;
  148         return (0);
  149 }
  150 #endif                          /* COMPAT_43 */
  151 
  152 
  153 /*
  154  * Memory Map (mmap) system call.  Note that the file offset
  155  * and address are allowed to be NOT page aligned, though if
  156  * the MAP_FIXED flag is set, both must have the same remainder
  157  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  158  * page-aligned, the actual mapping starts at trunc_page(addr)
  159  * and the return value is adjusted up by the page offset.
  160  *
  161  * Generally speaking, only character devices which are themselves
  162  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  163  * there would be no cache coherency between a descriptor and a VM mapping
  164  * both to the same character device.
  165  */
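/*
 * Illustrative userland sketch (not part of this file): mapping a file
 * region whose offset is not page aligned, relying on the adjustment
 * described in the comment above.  The kernel maps from the containing
 * page boundary and returns a pointer advanced by the page offset, so
 * the result points at byte `off' of the file.  The path is hypothetical.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static char *
map_at_offset(const char *path, off_t off, size_t len)
{
        int fd;
        void *p;

        fd = open(path, O_RDONLY);
        if (fd == -1)
                return (NULL);
        /* `off' need not be a multiple of PAGE_SIZE; see the comment above. */
        p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, off);
        close(fd);
        return (p == MAP_FAILED ? NULL : (char *)p);
}
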
  166 #ifndef _SYS_SYSPROTO_H_
  167 struct mmap_args {
  168         void *addr;
  169         size_t len;
  170         int prot;
  171         int flags;
  172         int fd;
  173         long pad;
  174         off_t pos;
  175 };
  176 #endif
  177 
  178 int
  179 sys_mmap(td, uap)
  180         struct thread *td;
  181         struct mmap_args *uap;
  182 {
  183 #ifdef HWPMC_HOOKS
  184         struct pmckern_map_in pkm;
  185 #endif
  186         struct file *fp;
  187         struct vnode *vp;
  188         vm_offset_t addr;
  189         vm_size_t size, pageoff;
  190         vm_prot_t cap_maxprot, prot, maxprot;
  191         void *handle;
  192         objtype_t handle_type;
  193         int align, error, flags;
  194         off_t pos;
  195         struct vmspace *vms = td->td_proc->p_vmspace;
  196         cap_rights_t rights;
  197 
  198         addr = (vm_offset_t) uap->addr;
  199         size = uap->len;
  200         prot = uap->prot & VM_PROT_ALL;
  201         flags = uap->flags;
  202         pos = uap->pos;
  203 
  204         fp = NULL;
  205 
  206         /*
  207          * Enforce the constraints.
  208          * Mapping of length 0 is only allowed for old binaries.
  209          * Anonymous mapping shall specify -1 as file descriptor and
  210          * zero position for new code. Be nice to ancient a.out
  211          * binaries and correct pos for anonymous mapping, since old
  212          * ld.so sometimes issues anonymous map requests with non-zero
  213          * pos.
  214          */
  215         if (!SV_CURPROC_FLAG(SV_AOUT)) {
  216                 if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
  217                     ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
  218                         return (EINVAL);
  219         } else {
  220                 if ((flags & MAP_ANON) != 0)
  221                         pos = 0;
  222         }
  223 
  224         if (flags & MAP_STACK) {
  225                 if ((uap->fd != -1) ||
  226                     ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
  227                         return (EINVAL);
  228                 flags |= MAP_ANON;
  229                 pos = 0;
  230         }
  231         if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL)
  232                 return (EINVAL);
  233         if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || uap->fd != -1 ||
  234             pos != 0 || (flags & (MAP_SHARED | MAP_PRIVATE | MAP_PREFAULT |
  235             MAP_PREFAULT_READ | MAP_ANON | MAP_STACK)) != 0))
  236                 return (EINVAL);
  237 
  238         /*
  239          * Align the file position to a page boundary,
  240          * and save its page offset component.
  241          */
  242         pageoff = (pos & PAGE_MASK);
  243         pos -= pageoff;
  244 
  245         /* Adjust size for rounding (on both ends). */
  246         size += pageoff;                        /* low end... */
  247         size = (vm_size_t) round_page(size);    /* hi end */
  248 
  249         /* Ensure alignment is at least a page and fits in a pointer. */
  250         align = flags & MAP_ALIGNMENT_MASK;
  251         if (align != 0 && align != MAP_ALIGNED_SUPER &&
  252             (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY ||
  253             align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT))
  254                 return (EINVAL);
  255 
  256         /*
  257          * Check for illegal addresses.  Watch out for address wrap... Note
  258          * that VM_*_ADDRESS are not constants due to casts (argh).
  259          */
  260         if (flags & MAP_FIXED) {
  261                 /*
  262                  * The specified address must have the same remainder
  263                  * as the file offset taken modulo PAGE_SIZE, so it
  264                  * should be aligned after adjustment by pageoff.
  265                  */
  266                 addr -= pageoff;
  267                 if (addr & PAGE_MASK)
  268                         return (EINVAL);
  269 
  270                 /* Address range must be all in user VM space. */
  271                 if (addr < vm_map_min(&vms->vm_map) ||
  272                     addr + size > vm_map_max(&vms->vm_map))
  273                         return (EINVAL);
  274                 if (addr + size < addr)
  275                         return (EINVAL);
  276 #ifdef MAP_32BIT
  277                 if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR)
  278                         return (EINVAL);
  279         } else if (flags & MAP_32BIT) {
  280                 /*
  281                  * For MAP_32BIT, override the hint if it is too high and
  282                  * do not bother moving the mapping past the heap (since
  283                  * the heap is usually above 2GB).
  284                  */
  285                 if (addr + size > MAP_32BIT_MAX_ADDR)
  286                         addr = 0;
  287 #endif
  288         } else {
  289                 /*
  290                  * XXX for non-fixed mappings where no hint is provided or
  291                  * the hint would fall in the potential heap space,
  292                  * place it after the end of the largest possible heap.
  293                  *
  294                  * There should really be a pmap call to determine a reasonable
  295                  * location.
  296                  */
  297                 PROC_LOCK(td->td_proc);
  298                 if (addr == 0 ||
  299                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
  300                     addr < round_page((vm_offset_t)vms->vm_daddr +
  301                     lim_max(td->td_proc, RLIMIT_DATA))))
  302                         addr = round_page((vm_offset_t)vms->vm_daddr +
  303                             lim_max(td->td_proc, RLIMIT_DATA));
  304                 PROC_UNLOCK(td->td_proc);
  305         }
  306         if ((flags & MAP_GUARD) != 0) {
  307                 handle = NULL;
  308                 handle_type = OBJT_DEFAULT;
  309                 maxprot = VM_PROT_NONE;
  310                 cap_maxprot = VM_PROT_NONE;
  311         } else if ((flags & MAP_ANON) != 0) {
  312                 /*
  313                  * Mapping blank space is trivial.
  314                  */
  315                 handle = NULL;
  316                 handle_type = OBJT_DEFAULT;
  317                 maxprot = VM_PROT_ALL;
  318                 cap_maxprot = VM_PROT_ALL;
  319         } else {
  320                 /*
  321                  * Mapping file, get fp for validation and don't let the
  322                  * descriptor disappear on us if we block. Check capability
  323                  * rights, but also return the maximum rights to be combined
  324                  * with maxprot later.
  325                  */
  326                 cap_rights_init(&rights, CAP_MMAP);
  327                 if (prot & PROT_READ)
  328                         cap_rights_set(&rights, CAP_MMAP_R);
  329                 if ((flags & MAP_SHARED) != 0) {
  330                         if (prot & PROT_WRITE)
  331                                 cap_rights_set(&rights, CAP_MMAP_W);
  332                 }
  333                 if (prot & PROT_EXEC)
  334                         cap_rights_set(&rights, CAP_MMAP_X);
  335                 error = fget_mmap(td, uap->fd, &rights, &cap_maxprot, &fp);
  336                 if (error != 0)
  337                         goto done;
  338                 if (fp->f_type == DTYPE_SHM) {
  339                         handle = fp->f_data;
  340                         handle_type = OBJT_SWAP;
  341                         maxprot = VM_PROT_NONE;
  342 
  343                         /* FREAD should always be set. */
  344                         if (fp->f_flag & FREAD)
  345                                 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
  346                         if (fp->f_flag & FWRITE)
  347                                 maxprot |= VM_PROT_WRITE;
  348                         goto map;
  349                 }
  350                 if (fp->f_type != DTYPE_VNODE) {
  351                         error = ENODEV;
  352                         goto done;
  353                 }
  354 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
  355     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
  356                 /*
  357                  * POSIX shared-memory objects are defined to have
  358                  * kernel persistence, and are not defined to support
  359                  * read(2)/write(2) -- or even open(2).  Thus, we can
  360                  * use MAP_ASYNC to trade on-disk coherence for speed.
  361                  * The shm_open(3) library routine turns on the FPOSIXSHM
  362                  * flag to request this behavior.
  363                  */
  364                 if (fp->f_flag & FPOSIXSHM)
  365                         flags |= MAP_NOSYNC;
  366 #endif
  367                 vp = fp->f_vnode;
  368                 /*
  369                  * Ensure that file and memory protections are
  370                  * compatible.  Note that we only worry about
  371                  * writability if mapping is shared; in this case,
  372                  * current and max prot are dictated by the open file.
  373                  * XXX use the vnode instead?  Problem is: what
  374                  * credentials do we use for determination? What if
  375                  * proc does a setuid?
  376                  */
  377                 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
  378                         maxprot = VM_PROT_NONE;
  379                 else
  380                         maxprot = VM_PROT_EXECUTE;
  381                 if (fp->f_flag & FREAD) {
  382                         maxprot |= VM_PROT_READ;
  383                 } else if (prot & PROT_READ) {
  384                         error = EACCES;
  385                         goto done;
  386                 }
  387                 /*
  388                  * If we are sharing potential changes (either via
  389                  * MAP_SHARED or via the implicit sharing of character
  390                  * device mappings), and we are trying to get write
  391                  * permission although we opened it without asking
  392                  * for it, bail out.
  393                  */
  394                 if ((flags & MAP_SHARED) != 0) {
  395                         if ((fp->f_flag & FWRITE) != 0) {
  396                                 maxprot |= VM_PROT_WRITE;
  397                         } else if ((prot & PROT_WRITE) != 0) {
  398                                 error = EACCES;
  399                                 goto done;
  400                         }
  401                 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
  402                         maxprot |= VM_PROT_WRITE;
  403                         cap_maxprot |= VM_PROT_WRITE;
  404                 }
  405                 handle = (void *)vp;
  406                 handle_type = OBJT_VNODE;
  407         }
  408 map:
  409         td->td_fpop = fp;
  410         maxprot &= cap_maxprot;
  411         error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
  412             flags, handle_type, handle, pos);
  413         td->td_fpop = NULL;
  414 #ifdef HWPMC_HOOKS
  415         /* inform hwpmc(4) if an executable is being mapped */
  416         if (error == 0 && handle_type == OBJT_VNODE &&
  417             (prot & PROT_EXEC)) {
  418                 pkm.pm_file = handle;
  419                 pkm.pm_address = (uintptr_t) addr;
  420                 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
  421         }
  422 #endif
  423         if (error == 0)
  424                 td->td_retval[0] = (register_t) (addr + pageoff);
  425 done:
  426         if (fp)
  427                 fdrop(fp, td);
  428 
  429         return (error);
  430 }
  431 
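/*
 * Illustrative userland sketch (not part of this file): the CAP_MMAP_*
 * rights checked by sys_mmap() above have userland counterparts.  A
 * descriptor limited to CAP_MMAP_R can back PROT_READ mappings only;
 * requesting PROT_WRITE on a MAP_SHARED mapping of it fails with
 * ENOTCAPABLE.  The path is hypothetical.
 */
#include <sys/capsicum.h>
#include <sys/mman.h>
#include <fcntl.h>

static void *
map_readonly_capsicum(const char *path, size_t len)
{
        cap_rights_t rights;
        int fd;
        void *p;

        fd = open(path, O_RDONLY);
        if (fd == -1)
                return (NULL);
        /* Restrict the descriptor to read-only mmap before mapping. */
        if (cap_rights_limit(fd, cap_rights_init(&rights, CAP_MMAP_R)) == -1)
                return (NULL);
        p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
        return (p == MAP_FAILED ? NULL : p);
}
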
  432 int
  433 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
  434 {
  435         struct mmap_args oargs;
  436 
  437         oargs.addr = uap->addr;
  438         oargs.len = uap->len;
  439         oargs.prot = uap->prot;
  440         oargs.flags = uap->flags;
  441         oargs.fd = uap->fd;
  442         oargs.pos = uap->pos;
  443         return (sys_mmap(td, &oargs));
  444 }
  445 
  446 #ifdef COMPAT_43
  447 #ifndef _SYS_SYSPROTO_H_
  448 struct ommap_args {
  449         caddr_t addr;
  450         int len;
  451         int prot;
  452         int flags;
  453         int fd;
  454         long pos;
  455 };
  456 #endif
  457 int
  458 ommap(td, uap)
  459         struct thread *td;
  460         struct ommap_args *uap;
  461 {
  462         struct mmap_args nargs;
  463         static const char cvtbsdprot[8] = {
  464                 0,
  465                 PROT_EXEC,
  466                 PROT_WRITE,
  467                 PROT_EXEC | PROT_WRITE,
  468                 PROT_READ,
  469                 PROT_EXEC | PROT_READ,
  470                 PROT_WRITE | PROT_READ,
  471                 PROT_EXEC | PROT_WRITE | PROT_READ,
  472         };
  473 
  474 #define OMAP_ANON       0x0002
  475 #define OMAP_COPY       0x0020
  476 #define OMAP_SHARED     0x0010
  477 #define OMAP_FIXED      0x0100
  478 
  479         nargs.addr = uap->addr;
  480         nargs.len = uap->len;
  481         nargs.prot = cvtbsdprot[uap->prot & 0x7];
  482 #ifdef COMPAT_FREEBSD32
  483 #if defined(__amd64__) || defined(__ia64__)
  484         if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
  485             nargs.prot != 0)
  486                 nargs.prot |= PROT_EXEC;
  487 #endif
  488 #endif
  489         nargs.flags = 0;
  490         if (uap->flags & OMAP_ANON)
  491                 nargs.flags |= MAP_ANON;
  492         if (uap->flags & OMAP_COPY)
  493                 nargs.flags |= MAP_COPY;
  494         if (uap->flags & OMAP_SHARED)
  495                 nargs.flags |= MAP_SHARED;
  496         else
  497                 nargs.flags |= MAP_PRIVATE;
  498         if (uap->flags & OMAP_FIXED)
  499                 nargs.flags |= MAP_FIXED;
  500         nargs.fd = uap->fd;
  501         nargs.pos = uap->pos;
  502         return (sys_mmap(td, &nargs));
  503 }
  504 #endif                          /* COMPAT_43 */
  505 
  506 
  507 #ifndef _SYS_SYSPROTO_H_
  508 struct msync_args {
  509         void *addr;
  510         size_t len;
  511         int flags;
  512 };
  513 #endif
  514 int
  515 sys_msync(td, uap)
  516         struct thread *td;
  517         struct msync_args *uap;
  518 {
  519         vm_offset_t addr;
  520         vm_size_t size, pageoff;
  521         int flags;
  522         vm_map_t map;
  523         int rv;
  524 
  525         addr = (vm_offset_t) uap->addr;
  526         size = uap->len;
  527         flags = uap->flags;
  528 
  529         pageoff = (addr & PAGE_MASK);
  530         addr -= pageoff;
  531         size += pageoff;
  532         size = (vm_size_t) round_page(size);
  533         if (addr + size < addr)
  534                 return (EINVAL);
  535 
  536         if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
  537                 return (EINVAL);
  538 
  539         map = &td->td_proc->p_vmspace->vm_map;
  540 
  541         /*
  542          * Clean the pages and interpret the return value.
  543          */
  544         rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
  545             (flags & MS_INVALIDATE) != 0);
  546         switch (rv) {
  547         case KERN_SUCCESS:
  548                 return (0);
  549         case KERN_INVALID_ADDRESS:
  550                 return (ENOMEM);
  551         case KERN_INVALID_ARGUMENT:
  552                 return (EBUSY);
  553         case KERN_FAILURE:
  554                 return (EIO);
  555         default:
  556                 return (EINVAL);
  557         }
  558 }
  559 
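/*
 * Illustrative userland sketch (not part of this file): flushing a
 * shared file mapping with msync(2).  As enforced above, MS_ASYNC and
 * MS_INVALIDATE may not be combined; the kernel truncates the address
 * to a page boundary and rounds the length itself.
 */
#include <sys/mman.h>

static int
flush_mapping(void *addr, size_t len)
{
        /* Write back dirty pages and wait for the I/O to complete. */
        return (msync(addr, len, MS_SYNC));
}
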
  560 #ifndef _SYS_SYSPROTO_H_
  561 struct munmap_args {
  562         void *addr;
  563         size_t len;
  564 };
  565 #endif
  566 int
  567 sys_munmap(td, uap)
  568         struct thread *td;
  569         struct munmap_args *uap;
  570 {
  571 #ifdef HWPMC_HOOKS
  572         struct pmckern_map_out pkm;
  573         vm_map_entry_t entry;
  574 #endif
  575         vm_offset_t addr;
  576         vm_size_t size, pageoff;
  577         vm_map_t map;
  578 
  579         addr = (vm_offset_t) uap->addr;
  580         size = uap->len;
  581         if (size == 0)
  582                 return (EINVAL);
  583 
  584         pageoff = (addr & PAGE_MASK);
  585         addr -= pageoff;
  586         size += pageoff;
  587         size = (vm_size_t) round_page(size);
  588         if (addr + size < addr)
  589                 return (EINVAL);
  590 
  591         /*
  592          * Check for illegal addresses.  Watch out for address wrap...
  593          */
  594         map = &td->td_proc->p_vmspace->vm_map;
  595         if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
  596                 return (EINVAL);
  597         vm_map_lock(map);
  598 #ifdef HWPMC_HOOKS
  599         /*
  600          * Inform hwpmc if the address range being unmapped contains
  601          * an executable region.
  602          */
  603         pkm.pm_address = (uintptr_t) NULL;
  604         if (vm_map_lookup_entry(map, addr, &entry)) {
  605                 for (;
  606                      entry != &map->header && entry->start < addr + size;
  607                      entry = entry->next) {
  608                         if (vm_map_check_protection(map, entry->start,
  609                                 entry->end, VM_PROT_EXECUTE) == TRUE) {
  610                                 pkm.pm_address = (uintptr_t) addr;
  611                                 pkm.pm_size = (size_t) size;
  612                                 break;
  613                         }
  614                 }
  615         }
  616 #endif
  617         vm_map_delete(map, addr, addr + size);
  618 
  619 #ifdef HWPMC_HOOKS
  620         /* downgrade the lock to prevent a LOR with the pmc-sx lock */
  621         vm_map_lock_downgrade(map);
  622         if (pkm.pm_address != (uintptr_t) NULL)
  623                 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
  624         vm_map_unlock_read(map);
  625 #else
  626         vm_map_unlock(map);
  627 #endif
  628         /* vm_map_delete returns nothing but KERN_SUCCESS anyway */
  629         return (0);
  630 }
  631 
  632 #ifndef _SYS_SYSPROTO_H_
  633 struct mprotect_args {
  634         const void *addr;
  635         size_t len;
  636         int prot;
  637 };
  638 #endif
  639 int
  640 sys_mprotect(td, uap)
  641         struct thread *td;
  642         struct mprotect_args *uap;
  643 {
  644         vm_offset_t addr;
  645         vm_size_t size, pageoff;
  646         vm_prot_t prot;
  647 
  648         addr = (vm_offset_t) uap->addr;
  649         size = uap->len;
  650         prot = uap->prot & VM_PROT_ALL;
  651 
  652         pageoff = (addr & PAGE_MASK);
  653         addr -= pageoff;
  654         size += pageoff;
  655         size = (vm_size_t) round_page(size);
  656         if (addr + size < addr)
  657                 return (EINVAL);
  658 
  659         switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
  660             addr + size, prot, FALSE)) {
  661         case KERN_SUCCESS:
  662                 return (0);
  663         case KERN_PROTECTION_FAILURE:
  664                 return (EACCES);
  665         case KERN_RESOURCE_SHORTAGE:
  666                 return (ENOMEM);
  667         }
  668         return (EINVAL);
  669 }
  670 
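/*
 * Illustrative userland sketch (not part of this file): revoking write
 * access to an initialized region, which sys_mprotect() above turns into
 * vm_map_protect().  EACCES corresponds to KERN_PROTECTION_FAILURE,
 * i.e. asking for more than the mapping's maximum protection.
 */
#include <sys/mman.h>

static int
seal_region(void *addr, size_t len)
{
        /* Later stores into [addr, addr + len) will deliver SIGSEGV. */
        return (mprotect(addr, len, PROT_READ));
}
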
  671 #ifndef _SYS_SYSPROTO_H_
  672 struct minherit_args {
  673         void *addr;
  674         size_t len;
  675         int inherit;
  676 };
  677 #endif
  678 int
  679 sys_minherit(struct thread *td, struct minherit_args *uap)
  680 {
  681         vm_offset_t addr;
  682         vm_size_t size, pageoff;
  683         vm_inherit_t inherit;
  684 
  685         addr = (vm_offset_t)uap->addr;
  686         size = uap->len;
  687         inherit = uap->inherit;
  688 
  689         pageoff = (addr & PAGE_MASK);
  690         addr -= pageoff;
  691         size += pageoff;
  692         size = (vm_size_t) round_page(size);
  693         if (addr + size < addr)
  694                 return (EINVAL);
  695 
  696         switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
  697             addr + size, inherit)) {
  698         case KERN_SUCCESS:
  699                 return (0);
  700         case KERN_PROTECTION_FAILURE:
  701                 return (EACCES);
  702         }
  703         return (EINVAL);
  704 }
  705 
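/*
 * Illustrative userland sketch (not part of this file): marking an
 * anonymous region so that children created by fork(2) share it,
 * which sys_minherit() above implements via vm_map_inherit().
 */
#include <sys/mman.h>
#include <stddef.h>

static void *
alloc_fork_shared(size_t len)
{
        void *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
            -1, 0);
        if (p == MAP_FAILED)
                return (NULL);
        /* Forked children will share these pages instead of copying them. */
        if (minherit(p, len, INHERIT_SHARE) == -1) {
                (void)munmap(p, len);
                return (NULL);
        }
        return (p);
}
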
  706 #ifndef _SYS_SYSPROTO_H_
  707 struct madvise_args {
  708         void *addr;
  709         size_t len;
  710         int behav;
  711 };
  712 #endif
  713 
  714 int
  715 sys_madvise(struct thread *td, struct madvise_args *uap)
  716 {
  717         vm_offset_t start, end;
  718         vm_map_t map;
  719         int flags;
  720 
  721         /*
  722          * Check for our special case, advising the swap pager we are
  723          * "immortal."
  724          */
  725         if (uap->behav == MADV_PROTECT) {
  726                 flags = PPROT_SET;
  727                 return (kern_procctl(td, P_PID, td->td_proc->p_pid,
  728                     PROC_SPROTECT, &flags));
  729         }
  730 
  731         /*
  732          * Check for illegal behavior
  733          */
  734         if (uap->behav < 0 || uap->behav > MADV_CORE)
  735                 return (EINVAL);
  736         /*
  737          * Check for illegal addresses.  Watch out for address wrap... Note
  738          * that VM_*_ADDRESS are not constants due to casts (argh).
  739          */
  740         map = &td->td_proc->p_vmspace->vm_map;
  741         if ((vm_offset_t)uap->addr < vm_map_min(map) ||
  742             (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
  743                 return (EINVAL);
  744         if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
  745                 return (EINVAL);
  746 
  747         /*
  748          * Since this routine is only advisory, we default to conservative
  749          * behavior.
  750          */
  751         start = trunc_page((vm_offset_t) uap->addr);
  752         end = round_page((vm_offset_t) uap->addr + uap->len);
  753 
  754         if (vm_map_madvise(map, start, end, uap->behav))
  755                 return (EINVAL);
  756         return (0);
  757 }
  758 
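/*
 * Illustrative userland sketch (not part of this file): advising the VM
 * system about an access pattern.  sys_madvise() above rejects values
 * outside 0..MADV_CORE and routes MADV_PROTECT to PROC_SPROTECT.
 */
#include <sys/mman.h>

static int
prefetch_then_discard(void *addr, size_t len)
{
        /* Hint that the range will be referenced soon. */
        if (madvise(addr, len, MADV_WILLNEED) == -1)
                return (-1);
        /* ... consume the data ... */
        /* Hint that the range is no longer needed. */
        return (madvise(addr, len, MADV_DONTNEED));
}
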
  759 #ifndef _SYS_SYSPROTO_H_
  760 struct mincore_args {
  761         const void *addr;
  762         size_t len;
  763         char *vec;
  764 };
  765 #endif
  766 
  767 int
  768 sys_mincore(struct thread *td, struct mincore_args *uap)
  769 {
  770         vm_offset_t addr, first_addr;
  771         vm_offset_t end, cend;
  772         pmap_t pmap;
  773         vm_map_t map;
  774         char *vec;
  775         int error = 0;
  776         int vecindex, lastvecindex;
  777         vm_map_entry_t current;
  778         vm_map_entry_t entry;
  779         vm_object_t object;
  780         vm_paddr_t locked_pa;
  781         vm_page_t m;
  782         vm_pindex_t pindex;
  783         int mincoreinfo;
  784         unsigned int timestamp;
  785         boolean_t locked;
  786 
  787         /*
  788          * Make sure that the addresses presented are valid for user
  789          * mode.
  790          */
  791         first_addr = addr = trunc_page((vm_offset_t) uap->addr);
  792         end = addr + (vm_size_t)round_page(uap->len);
  793         map = &td->td_proc->p_vmspace->vm_map;
  794         if (end > vm_map_max(map) || end < addr)
  795                 return (ENOMEM);
  796 
  797         /*
  798          * Address of byte vector
  799          */
  800         vec = uap->vec;
  801 
  802         pmap = vmspace_pmap(td->td_proc->p_vmspace);
  803 
  804         vm_map_lock_read(map);
  805 RestartScan:
  806         timestamp = map->timestamp;
  807 
  808         if (!vm_map_lookup_entry(map, addr, &entry)) {
  809                 vm_map_unlock_read(map);
  810                 return (ENOMEM);
  811         }
  812 
  813         /*
  814          * Do this on a map entry basis so that if the pages are not
  815          * in the current process's address space, we can easily look
  816          * up the pages elsewhere.
  817          */
  818         lastvecindex = -1;
  819         for (current = entry;
  820             (current != &map->header) && (current->start < end);
  821             current = current->next) {
  822 
  823                 /*
  824                  * check for contiguity
  825                  */
  826                 if (current->end < end &&
  827                     (entry->next == &map->header ||
  828                      current->next->start > current->end)) {
  829                         vm_map_unlock_read(map);
  830                         return (ENOMEM);
  831                 }
  832 
  833                 /*
  834                  * ignore submaps (for now) or null objects
  835                  */
  836                 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
  837                         current->object.vm_object == NULL)
  838                         continue;
  839 
  840                 /*
  841                  * limit this scan to the current map entry and the
  842                  * limits for the mincore call
  843                  */
  844                 if (addr < current->start)
  845                         addr = current->start;
  846                 cend = current->end;
  847                 if (cend > end)
  848                         cend = end;
  849 
  850                 /*
  851                  * scan this entry one page at a time
  852                  */
  853                 while (addr < cend) {
  854                         /*
  855                          * Check pmap first, it is likely faster, also
  856                          * it can provide info as to whether we are the
  857                          * one referencing or modifying the page.
  858                          */
  859                         object = NULL;
  860                         locked_pa = 0;
  861                 retry:
  862                         m = NULL;
  863                         mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
  864                         if (locked_pa != 0) {
  865                                 /*
  866                                  * The page is mapped by this process but not
  867                                  * both accessed and modified.  It is also
  868                                  * managed.  Acquire the object lock so that
  869                                  * other mappings might be examined.
  870                                  */
  871                                 m = PHYS_TO_VM_PAGE(locked_pa);
  872                                 if (m->object != object) {
  873                                         if (object != NULL)
  874                                                 VM_OBJECT_WUNLOCK(object);
  875                                         object = m->object;
  876                                         locked = VM_OBJECT_TRYWLOCK(object);
  877                                         vm_page_unlock(m);
  878                                         if (!locked) {
  879                                                 VM_OBJECT_WLOCK(object);
  880                                                 vm_page_lock(m);
  881                                                 goto retry;
  882                                         }
  883                                 } else
  884                                         vm_page_unlock(m);
  885                                 KASSERT(m->valid == VM_PAGE_BITS_ALL,
  886                                     ("mincore: page %p is mapped but invalid",
  887                                     m));
  888                         } else if (mincoreinfo == 0) {
  889                                 /*
  890                                  * The page is not mapped by this process.  If
  891                                  * the object implements managed pages, then
  892                                  * determine if the page is resident so that
  893                                  * the mappings might be examined.
  894                                  */
  895                                 if (current->object.vm_object != object) {
  896                                         if (object != NULL)
  897                                                 VM_OBJECT_WUNLOCK(object);
  898                                         object = current->object.vm_object;
  899                                         VM_OBJECT_WLOCK(object);
  900                                 }
  901                                 if (object->type == OBJT_DEFAULT ||
  902                                     object->type == OBJT_SWAP ||
  903                                     object->type == OBJT_VNODE) {
  904                                         pindex = OFF_TO_IDX(current->offset +
  905                                             (addr - current->start));
  906                                         m = vm_page_lookup(object, pindex);
  907                                         if (m == NULL &&
  908                                             vm_page_is_cached(object, pindex))
  909                                                 mincoreinfo = MINCORE_INCORE;
  910                                         if (m != NULL && m->valid == 0)
  911                                                 m = NULL;
  912                                         if (m != NULL)
  913                                                 mincoreinfo = MINCORE_INCORE;
  914                                 }
  915                         }
  916                         if (m != NULL) {
  917                                 /* Examine other mappings to the page. */
  918                                 if (m->dirty == 0 && pmap_is_modified(m))
  919                                         vm_page_dirty(m);
  920                                 if (m->dirty != 0)
  921                                         mincoreinfo |= MINCORE_MODIFIED_OTHER;
  922                                 /*
  923                                  * The first test for PGA_REFERENCED is an
  924                                  * optimization.  The second test is
  925                                  * required because a concurrent pmap
  926                                  * operation could clear the last reference
  927                                  * and set PGA_REFERENCED before the call to
  928                                  * pmap_is_referenced(). 
  929                                  */
  930                                 if ((m->aflags & PGA_REFERENCED) != 0 ||
  931                                     pmap_is_referenced(m) ||
  932                                     (m->aflags & PGA_REFERENCED) != 0)
  933                                         mincoreinfo |= MINCORE_REFERENCED_OTHER;
  934                         }
  935                         if (object != NULL)
  936                                 VM_OBJECT_WUNLOCK(object);
  937 
  938                         /*
  939                          * subyte may page fault.  In case it needs to modify
  940                          * the map, we release the lock.
  941                          */
  942                         vm_map_unlock_read(map);
  943 
  944                         /*
  945                          * calculate index into user supplied byte vector
  946                          */
  947                         vecindex = OFF_TO_IDX(addr - first_addr);
  948 
  949                         /*
  950                          * If we have skipped map entries, we need to make sure that
  951                          * the byte vector is zeroed for those skipped entries.
  952                          */
  953                         while ((lastvecindex + 1) < vecindex) {
  954                                 ++lastvecindex;
  955                                 error = subyte(vec + lastvecindex, 0);
  956                                 if (error) {
  957                                         error = EFAULT;
  958                                         goto done2;
  959                                 }
  960                         }
  961 
  962                         /*
  963                          * Pass the page information to the user
  964                          */
  965                         error = subyte(vec + vecindex, mincoreinfo);
  966                         if (error) {
  967                                 error = EFAULT;
  968                                 goto done2;
  969                         }
  970 
  971                         /*
  972                          * If the map has changed, due to the subyte, the previous
  973                          * output may be invalid.
  974                          */
  975                         vm_map_lock_read(map);
  976                         if (timestamp != map->timestamp)
  977                                 goto RestartScan;
  978 
  979                         lastvecindex = vecindex;
  980                         addr += PAGE_SIZE;
  981                 }
  982         }
  983 
  984         /*
  985          * subyte may page fault.  In case it needs to modify
  986          * the map, we release the lock.
  987          */
  988         vm_map_unlock_read(map);
  989 
  990         /*
  991          * Zero the last entries in the byte vector.
  992          */
  993         vecindex = OFF_TO_IDX(end - first_addr);
  994         while ((lastvecindex + 1) < vecindex) {
  995                 ++lastvecindex;
  996                 error = subyte(vec + lastvecindex, 0);
  997                 if (error) {
  998                         error = EFAULT;
  999                         goto done2;
 1000                 }
 1001         }
 1002 
 1003         /*
 1004          * If the map has changed, due to the subyte, the previous
 1005          * output may be invalid.
 1006          */
 1007         vm_map_lock_read(map);
 1008         if (timestamp != map->timestamp)
 1009                 goto RestartScan;
 1010         vm_map_unlock_read(map);
 1011 done2:
 1012         return (error);
 1013 }
 1014 
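/*
 * Illustrative userland sketch (not part of this file): consuming the
 * per-page byte vector that sys_mincore() above fills in, counting the
 * resident pages of a mapping.
 */
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

static long
count_resident_pages(void *addr, size_t len)
{
        char *vec;
        size_t i, npages, pagesz;
        long resident;

        pagesz = (size_t)getpagesize();
        npages = (len + pagesz - 1) / pagesz;
        if ((vec = malloc(npages)) == NULL)
                return (-1);
        if (mincore(addr, len, vec) == -1) {
                free(vec);
                return (-1);
        }
        resident = 0;
        for (i = 0; i < npages; i++)
                if (vec[i] & MINCORE_INCORE)
                        resident++;
        free(vec);
        return (resident);
}
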
 1015 #ifndef _SYS_SYSPROTO_H_
 1016 struct mlock_args {
 1017         const void *addr;
 1018         size_t len;
 1019 };
 1020 #endif
 1021 int
 1022 sys_mlock(struct thread *td, struct mlock_args *uap)
 1023 {
 1024 
 1025         return (vm_mlock(td->td_proc, td->td_ucred, uap->addr, uap->len));
 1026 }
 1027 
 1028 int
 1029 vm_mlock(struct proc *proc, struct ucred *cred, const void *addr0, size_t len)
 1030 {
 1031         vm_offset_t addr, end, last, start;
 1032         vm_size_t npages, size;
 1033         vm_map_t map;
 1034         unsigned long nsize;
 1035         int error;
 1036 
 1037         error = priv_check_cred(cred, PRIV_VM_MLOCK, 0);
 1038         if (error)
 1039                 return (error);
 1040         addr = (vm_offset_t)addr0;
 1041         size = len;
 1042         last = addr + size;
 1043         start = trunc_page(addr);
 1044         end = round_page(last);
 1045         if (last < addr || end < addr)
 1046                 return (EINVAL);
 1047         npages = atop(end - start);
 1048         if (npages > vm_page_max_wired)
 1049                 return (ENOMEM);
 1050         map = &proc->p_vmspace->vm_map;
 1051         PROC_LOCK(proc);
 1052         nsize = ptoa(npages + pmap_wired_count(map->pmap));
 1053         if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
 1054                 PROC_UNLOCK(proc);
 1055                 return (ENOMEM);
 1056         }
 1057         PROC_UNLOCK(proc);
 1058         if (npages + cnt.v_wire_count > vm_page_max_wired)
 1059                 return (EAGAIN);
 1060 #ifdef RACCT
 1061         if (racct_enable) {
 1062                 PROC_LOCK(proc);
 1063                 error = racct_set(proc, RACCT_MEMLOCK, nsize);
 1064                 PROC_UNLOCK(proc);
 1065                 if (error != 0)
 1066                         return (ENOMEM);
 1067         }
 1068 #endif
 1069         error = vm_map_wire(map, start, end,
 1070             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1071 #ifdef RACCT
 1072         if (racct_enable && error != KERN_SUCCESS) {
 1073                 PROC_LOCK(proc);
 1074                 racct_set(proc, RACCT_MEMLOCK,
 1075                     ptoa(pmap_wired_count(map->pmap)));
 1076                 PROC_UNLOCK(proc);
 1077         }
 1078 #endif
 1079         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1080 }
 1081 
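/*
 * Illustrative userland sketch (not part of this file): wiring a buffer
 * with mlock(2).  As in vm_mlock() above, the kernel rejects requests
 * that would exceed RLIMIT_MEMLOCK with ENOMEM; the soft limit is
 * checked here first only to give a clearer diagnostic.
 */
#include <sys/types.h>
#include <sys/resource.h>
#include <sys/mman.h>
#include <stdio.h>

static int
wire_buffer(void *addr, size_t len)
{
        struct rlimit rl;

        if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0 &&
            (rlim_t)len > rl.rlim_cur) {
                fprintf(stderr, "request exceeds RLIMIT_MEMLOCK\n");
                return (-1);
        }
        return (mlock(addr, len));
}
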
 1082 #ifndef _SYS_SYSPROTO_H_
 1083 struct mlockall_args {
 1084         int     how;
 1085 };
 1086 #endif
 1087 
 1088 int
 1089 sys_mlockall(struct thread *td, struct mlockall_args *uap)
 1090 {
 1091         vm_map_t map;
 1092         int error;
 1093 
 1094         map = &td->td_proc->p_vmspace->vm_map;
 1095         error = priv_check(td, PRIV_VM_MLOCK);
 1096         if (error)
 1097                 return (error);
 1098 
 1099         if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 1100                 return (EINVAL);
 1101 
 1102         /*
 1103          * If wiring all pages in the process would cause it to exceed
 1104          * a hard resource limit, return ENOMEM.
 1105          */
 1106         if (!old_mlock && uap->how & MCL_CURRENT) {
 1107                 PROC_LOCK(td->td_proc);
 1108                 if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
 1109                         PROC_UNLOCK(td->td_proc);
 1110                         return (ENOMEM);
 1111                 }
 1112                 PROC_UNLOCK(td->td_proc);
 1113         }
 1114 #ifdef RACCT
 1115         if (racct_enable) {
 1116                 PROC_LOCK(td->td_proc);
 1117                 error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
 1118                 PROC_UNLOCK(td->td_proc);
 1119                 if (error != 0)
 1120                         return (ENOMEM);
 1121         }
 1122 #endif
 1123 
 1124         if (uap->how & MCL_FUTURE) {
 1125                 vm_map_lock(map);
 1126                 vm_map_modflags(map, MAP_WIREFUTURE, 0);
 1127                 vm_map_unlock(map);
 1128                 error = 0;
 1129         }
 1130 
 1131         if (uap->how & MCL_CURRENT) {
 1132                 /*
 1133                  * P1003.1-2001 mandates that all currently mapped pages
 1134                  * will be memory resident and locked (wired) upon return
 1135                  * from mlockall(). vm_map_wire() will wire pages, by
 1136                  * calling vm_fault_wire() for each page in the region.
 1137                  */
 1138                 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 1139                     VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1140                 error = (error == KERN_SUCCESS ? 0 : EAGAIN);
 1141         }
 1142 #ifdef RACCT
 1143         if (racct_enable && error != KERN_SUCCESS) {
 1144                 PROC_LOCK(td->td_proc);
 1145                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1146                     ptoa(pmap_wired_count(map->pmap)));
 1147                 PROC_UNLOCK(td->td_proc);
 1148         }
 1149 #endif
 1150 
 1151         return (error);
 1152 }
 1153 
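/*
 * Illustrative userland sketch (not part of this file): wiring the whole
 * address space, present and future, as sys_mlockall() above implements
 * with vm_map_wire() and the MAP_WIREFUTURE flag.  Typical for
 * latency-sensitive processes before they enter their main loop.
 */
#include <sys/mman.h>

static int
wire_everything(void)
{
        /* MCL_FUTURE also wires mappings created later, e.g. by malloc(). */
        return (mlockall(MCL_CURRENT | MCL_FUTURE));
}
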
 1154 #ifndef _SYS_SYSPROTO_H_
 1155 struct munlockall_args {
 1156         register_t dummy;
 1157 };
 1158 #endif
 1159 
 1160 int
 1161 sys_munlockall(struct thread *td, struct munlockall_args *uap)
 1162 {
 1163         vm_map_t map;
 1164         int error;
 1165 
 1166         map = &td->td_proc->p_vmspace->vm_map;
 1167         error = priv_check(td, PRIV_VM_MUNLOCK);
 1168         if (error)
 1169                 return (error);
 1170 
 1171         /* Clear the MAP_WIREFUTURE flag from this vm_map. */
 1172         vm_map_lock(map);
 1173         vm_map_modflags(map, 0, MAP_WIREFUTURE);
 1174         vm_map_unlock(map);
 1175 
 1176         /* Forcibly unwire all pages. */
 1177         error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 1178             VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1179 #ifdef RACCT
 1180         if (racct_enable && error == KERN_SUCCESS) {
 1181                 PROC_LOCK(td->td_proc);
 1182                 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 1183                 PROC_UNLOCK(td->td_proc);
 1184         }
 1185 #endif
 1186 
 1187         return (error);
 1188 }
 1189 
 1190 #ifndef _SYS_SYSPROTO_H_
 1191 struct munlock_args {
 1192         const void *addr;
 1193         size_t len;
 1194 };
 1195 #endif
 1196 int
 1197 sys_munlock(td, uap)
 1198         struct thread *td;
 1199         struct munlock_args *uap;
 1200 {
 1201         vm_offset_t addr, end, last, start;
 1202         vm_size_t size;
 1203 #ifdef RACCT
 1204         vm_map_t map;
 1205 #endif
 1206         int error;
 1207 
 1208         error = priv_check(td, PRIV_VM_MUNLOCK);
 1209         if (error)
 1210                 return (error);
 1211         addr = (vm_offset_t)uap->addr;
 1212         size = uap->len;
 1213         last = addr + size;
 1214         start = trunc_page(addr);
 1215         end = round_page(last);
 1216         if (last < addr || end < addr)
 1217                 return (EINVAL);
 1218         error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 1219             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1220 #ifdef RACCT
 1221         if (racct_enable && error == KERN_SUCCESS) {
 1222                 PROC_LOCK(td->td_proc);
 1223                 map = &td->td_proc->p_vmspace->vm_map;
 1224                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1225                     ptoa(pmap_wired_count(map->pmap)));
 1226                 PROC_UNLOCK(td->td_proc);
 1227         }
 1228 #endif
 1229         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1230 }
 1231 
 1232 /*
 1233  * vm_mmap_vnode()
 1234  *
 1235  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1236  * operations on vnodes.
 1237  *
 1238  * For VCHR vnodes, the vnode lock is held over the call to
 1239  * vm_mmap_cdev() to keep vp->v_rdev valid.
 1240  */
 1241 int
 1242 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 1243     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1244     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
 1245     boolean_t *writecounted)
 1246 {
 1247         struct vattr va;
 1248         vm_object_t obj;
 1249         vm_offset_t foff;
 1250         struct ucred *cred;
 1251         int error, flags, locktype;
 1252 
 1253         cred = td->td_ucred;
 1254         if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
 1255                 locktype = LK_EXCLUSIVE;
 1256         else
 1257                 locktype = LK_SHARED;
 1258         if ((error = vget(vp, locktype, td)) != 0)
 1259                 return (error);
 1260         foff = *foffp;
 1261         flags = *flagsp;
 1262         obj = vp->v_object;
 1263         if (vp->v_type == VREG) {
 1264                 /*
 1265                  * Get the proper underlying object
 1266                  */
 1267                 if (obj == NULL) {
 1268                         error = EINVAL;
 1269                         goto done;
 1270                 }
 1271                 if (obj->type == OBJT_VNODE && obj->handle != vp) {
 1272                         vput(vp);
 1273                         vp = (struct vnode *)obj->handle;
 1274                         /*
 1275                          * Bypass filesystems obey the mpsafety of the
 1276                          * underlying fs.  Tmpfs never bypasses.
 1277                          */
 1278                         error = vget(vp, locktype, td);
 1279                         if (error != 0)
 1280                                 return (error);
 1281                 }
 1282                 if (locktype == LK_EXCLUSIVE) {
 1283                         *writecounted = TRUE;
 1284                         vnode_pager_update_writecount(obj, 0, objsize);
 1285                 }
 1286         } else if (vp->v_type == VCHR) {
 1287                 error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
 1288                     vp->v_rdev, foffp, objp);
 1289                 if (error == 0)
 1290                         goto mark_atime;
 1291                 goto done;
 1292         } else {
 1293                 error = EINVAL;
 1294                 goto done;
 1295         }
 1296         if ((error = VOP_GETATTR(vp, &va, cred)))
 1297                 goto done;
 1298 #ifdef MAC
 1299         error = mac_vnode_check_mmap(cred, vp, prot, flags);
 1300         if (error != 0)
 1301                 goto done;
 1302 #endif
 1303         if ((flags & MAP_SHARED) != 0) {
 1304                 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
 1305                         if (prot & PROT_WRITE) {
 1306                                 error = EPERM;
 1307                                 goto done;
 1308                         }
 1309                         *maxprotp &= ~VM_PROT_WRITE;
 1310                 }
 1311         }
 1312         /*
 1313          * If it is a regular file without any references
 1314          * we do not need to sync it.
 1315          * Adjust object size to be the size of the actual file.
 1316          */
 1317         objsize = round_page(va.va_size);
 1318         if (va.va_nlink == 0)
 1319                 flags |= MAP_NOSYNC;
 1320         if (obj->type == OBJT_VNODE)
 1321                 obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff,
 1322                     cred);
 1323         else {
 1324                 KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP,
 1325                     ("wrong object type"));
 1326                 vm_object_reference(obj);
 1327         }
 1328         if (obj == NULL) {
 1329                 error = ENOMEM;
 1330                 goto done;
 1331         }
 1332         *objp = obj;
 1333         *flagsp = flags;
 1334 
 1335 mark_atime:
 1336         vfs_mark_atime(vp, cred);
 1337 
 1338 done:
 1339         if (error != 0 && *writecounted) {
 1340                 *writecounted = FALSE;
 1341                 vnode_pager_update_writecount(obj, objsize, 0);
 1342         }
 1343         vput(vp);
 1344         return (error);
 1345 }
 1346 
 1347 /*
 1348  * vm_mmap_cdev()
 1349  *
 1350  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1351  * operations on cdevs.
 1352  */
 1353 int
 1354 vm_mmap_cdev(struct thread *td, vm_size_t objsize,
 1355     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1356     struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
 1357 {
 1358         vm_object_t obj;
 1359         struct cdevsw *dsw;
 1360         int error, flags, ref;
 1361 
 1362         flags = *flagsp;
 1363 
 1364         dsw = dev_refthread(cdev, &ref);
 1365         if (dsw == NULL)
 1366                 return (ENXIO);
 1367         if (dsw->d_flags & D_MMAP_ANON) {
 1368                 dev_relthread(cdev, ref);
 1369                 *maxprotp = VM_PROT_ALL;
 1370                 *flagsp |= MAP_ANON;
 1371                 return (0);
 1372         }
 1373         /*
 1374          * cdevs do not provide private mappings of any kind.
 1375          */
 1376         if ((*maxprotp & VM_PROT_WRITE) == 0 &&
 1377             (prot & PROT_WRITE) != 0) {
 1378                 dev_relthread(cdev, ref);
 1379                 return (EACCES);
 1380         }
 1381         if (flags & (MAP_PRIVATE|MAP_COPY)) {
 1382                 dev_relthread(cdev, ref);
 1383                 return (EINVAL);
 1384         }
 1385         /*
 1386          * Force device mappings to be shared.
 1387          */
 1388         flags |= MAP_SHARED;
 1389 #ifdef MAC_XXX
 1390         error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
 1391         if (error != 0) {
 1392                 dev_relthread(cdev, ref);
 1393                 return (error);
 1394         }
 1395 #endif
 1396         /*
 1397          * First, try d_mmap_single().  If that is not implemented
 1398          * (returns ENODEV), fall back to using the device pager.
 1399          * Note that d_mmap_single() must return a reference to the
 1400          * object (it needs to bump the reference count of the object
 1401          * it returns somehow).
 1402          *
 1403          * XXX assumes VM_PROT_* == PROT_*
 1404          */
 1405         error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
 1406         dev_relthread(cdev, ref);
 1407         if (error != ENODEV)
 1408                 return (error);
 1409         obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
 1410             td->td_ucred);
 1411         if (obj == NULL)
 1412                 return (EINVAL);
 1413         *objp = obj;
 1414         *flagsp = flags;
 1415         return (0);
 1416 }
 1417 
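/*
 * Illustrative userland sketch (not part of this file, and assuming the
 * device sets D_MMAP_ANON as FreeBSD's /dev/zero does): mapping a
 * character device.  vm_mmap_cdev() above converts such devices into
 * anonymous mappings; other devices go through d_mmap_single() or the
 * device pager, and their mappings are forced MAP_SHARED.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static void *
map_dev_zero(size_t len)
{
        int fd;
        void *p;

        fd = open("/dev/zero", O_RDWR);
        if (fd == -1)
                return (NULL);
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        close(fd);
        return (p == MAP_FAILED ? NULL : p);
}
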
 1418 /*
 1419  * vm_mmap_shm()
 1420  *
 1421  * MPSAFE
 1422  *
 1423  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1424  * operations on shm file descriptors.
 1425  */
 1426 int
 1427 vm_mmap_shm(struct thread *td, vm_size_t objsize,
 1428     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1429     struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
 1430 {
 1431         int error;
 1432 
 1433         if ((*flagsp & MAP_SHARED) != 0 &&
 1434             (*maxprotp & VM_PROT_WRITE) == 0 &&
 1435             (prot & PROT_WRITE) != 0)
 1436                 return (EACCES);
 1437 #ifdef MAC
 1438         error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
 1439         if (error != 0)
 1440                 return (error);
 1441 #endif
 1442         error = shm_mmap(shmfd, objsize, foff, objp);
 1443         if (error)
 1444                 return (error);
 1445         return (0);
 1446 }
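
/*
 * Editor's note: a hedged userland sketch of the path that ends up in
 * vm_mmap_shm() above.  mmap(2) on a POSIX shared memory descriptor is
 * routed here, and the permission check above is what turns a PROT_WRITE
 * MAP_SHARED request on a read-only descriptor into EACCES.  Error
 * handling is omitted for brevity.
 *
 *	#include <sys/mman.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	ftruncate(fd, 4096);
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);
 */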
 1447 
 1448 /*
 1449  * vm_mmap()
 1450  *
 1451  * MPSAFE
 1452  *
 1453  * Internal version of mmap.  Currently used by mmap, exec, and System V
 1454  * shared memory.  Handle is a vnode, cdev, or shmfd, or NULL for MAP_ANON.
 1455  */
 1456 int
 1457 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1458         vm_prot_t maxprot, int flags,
 1459         objtype_t handle_type, void *handle,
 1460         vm_ooffset_t foff)
 1461 {
 1462         boolean_t curmap, fitit;
 1463         vm_offset_t max_addr;
 1464         vm_object_t object = NULL;
 1465         struct thread *td = curthread;
 1466         int docow, error, findspace, rv;
 1467         boolean_t writecounted;
 1468 
 1469         if (size == 0)
 1470                 return (0);
 1471 
 1472         size = round_page(size);
 1473 
 1474         curmap = map == &td->td_proc->p_vmspace->vm_map;
 1475         if (curmap) {
 1476                 PROC_LOCK(td->td_proc);
 1477                 if (map->size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) {
 1478                         PROC_UNLOCK(td->td_proc);
 1479                         return (ENOMEM);
 1480                 }
 1481                 if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) {
 1482                         PROC_UNLOCK(td->td_proc);
 1483                         return (ENOMEM);
 1484                 }
 1485                 if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 1486                         if (ptoa(pmap_wired_count(map->pmap)) + size >
 1487                             lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
 1488                                 racct_set_force(td->td_proc, RACCT_VMEM,
 1489                                     map->size);
 1490                                 PROC_UNLOCK(td->td_proc);
 1491                                 return (ENOMEM);
 1492                         }
 1493                         error = racct_set(td->td_proc, RACCT_MEMLOCK,
 1494                             ptoa(pmap_wired_count(map->pmap)) + size);
 1495                         if (error != 0) {
 1496                                 racct_set_force(td->td_proc, RACCT_VMEM,
 1497                                     map->size);
 1498                                 PROC_UNLOCK(td->td_proc);
 1499                                 return (error);
 1500                         }
 1501                 }
 1502                 PROC_UNLOCK(td->td_proc);
 1503         }
 1504 
 1505         /*
 1506          * We can currently only deal with page-aligned file offsets.
 1507          * The check is here rather than in the syscall because the
 1508          * kernel calls this function internally for other mmap
 1509          * operations (such as in exec), and non-aligned offsets would
 1510          * cause pmap inconsistencies, so we want to be sure to
 1511          * disallow this in all cases.
 1512          */
 1513         if (foff & PAGE_MASK)
 1514                 return (EINVAL);
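        /*
         * Editor's note (illustration): with 4 KB pages, PAGE_MASK is 0xfff,
         * so a file offset of 0x1000 passes the check above while 0x1234 is
         * rejected with EINVAL (0x1234 & 0xfff == 0x234, which is nonzero).
         */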
 1515 
 1516         if ((flags & MAP_FIXED) == 0) {
 1517                 fitit = TRUE;
 1518                 *addr = round_page(*addr);
 1519         } else {
 1520                 if (*addr != trunc_page(*addr))
 1521                         return (EINVAL);
 1522                 fitit = FALSE;
 1523         }
 1524         writecounted = FALSE;
 1525 
 1526         /*
 1527          * Lookup/allocate object.
 1528          */
 1529         switch (handle_type) {
 1530         case OBJT_DEVICE:
 1531                 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
 1532                     handle, &foff, &object);
 1533                 break;
 1534         case OBJT_VNODE:
 1535                 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
 1536                     handle, &foff, &object, &writecounted);
 1537                 break;
 1538         case OBJT_SWAP:
 1539                 error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
 1540                     handle, foff, &object);
 1541                 break;
 1542         case OBJT_DEFAULT:
 1543                 if (handle == NULL) {
 1544                         error = 0;
 1545                         break;
 1546                 }
 1547                 /* FALLTHROUGH */
 1548         default:
 1549                 error = EINVAL;
 1550                 break;
 1551         }
 1552         if (error)
 1553                 return (error);
 1554         if (flags & MAP_ANON) {
 1555                 object = NULL;
 1556                 docow = 0;
 1557                 /*
 1558                  * Unnamed anonymous regions always start at 0.
 1559                  */
 1560                 if (handle == 0)
 1561                         foff = 0;
 1562         } else if (flags & MAP_PREFAULT_READ)
 1563                 docow = MAP_PREFAULT;
 1564         else
 1565                 docow = MAP_PREFAULT_PARTIAL;
 1566 
 1567         if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 1568                 docow |= MAP_COPY_ON_WRITE;
 1569         if (flags & MAP_NOSYNC)
 1570                 docow |= MAP_DISABLE_SYNCER;
 1571         if (flags & MAP_NOCORE)
 1572                 docow |= MAP_DISABLE_COREDUMP;
 1573         /* Shared memory is also shared with children. */
 1574         if (flags & MAP_SHARED)
 1575                 docow |= MAP_INHERIT_SHARE;
 1576         if (writecounted)
 1577                 docow |= MAP_VN_WRITECOUNT;
 1578         if (flags & MAP_STACK) {
 1579                 if (object != NULL)
 1580                         return (EINVAL);
 1581                 docow |= MAP_STACK_GROWS_DOWN;
 1582         }
 1583         if ((flags & MAP_EXCL) != 0)
 1584                 docow |= MAP_CHECK_EXCL;
 1585         if ((flags & MAP_GUARD) != 0)
 1586                 docow |= MAP_CREATE_GUARD;
 1587 
 1588         if (fitit) {
 1589                 if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER)
 1590                         findspace = VMFS_SUPER_SPACE;
 1591                 else if ((flags & MAP_ALIGNMENT_MASK) != 0)
 1592                         findspace = VMFS_ALIGNED_SPACE(flags >>
 1593                             MAP_ALIGNMENT_SHIFT);
 1594                 else
 1595                         findspace = VMFS_OPTIMAL_SPACE;
 1596                 max_addr = 0;
 1597 #ifdef MAP_32BIT
 1598                 if ((flags & MAP_32BIT) != 0)
 1599                         max_addr = MAP_32BIT_MAX_ADDR;
 1600 #endif
 1601                 if (curmap) {
 1602                         vm_offset_t min_addr;
 1603 
 1604                         PROC_LOCK(td->td_proc);
 1605                         min_addr = round_page((vm_offset_t)td->td_proc->
 1606                             p_vmspace->vm_daddr + lim_max(td->td_proc,
 1607                             RLIMIT_DATA));
 1608                         PROC_UNLOCK(td->td_proc);
 1609                         rv = vm_map_find_min(map, object, foff, addr, size,
 1610                             min_addr, max_addr,
 1611                             findspace, prot, maxprot, docow);
 1612                 } else {
 1613                         rv = vm_map_find(map, object, foff, addr, size,
 1614                             max_addr, findspace, prot, maxprot, docow);
 1615                 }
 1616         } else {
 1617                 rv = vm_map_fixed(map, object, foff, *addr, size,
 1618                     prot, maxprot, docow);
 1619         }
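        /*
         * Editor's note (illustration): the alignment request decoded above
         * originates in mmap(2) flags, for example:
         *
         *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
         *	    MAP_ANON | MAP_ALIGNED(21), -1, 0);
         *
         * asks for a 2 MB-aligned placement (1 << 21 == 2 MB), while
         * MAP_ALIGNED_SUPER asks for superpage alignment.  With no alignment
         * flags, VMFS_OPTIMAL_SPACE lets the VM map choose the placement.
         */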
 1620 
 1621         if (rv == KERN_SUCCESS) {
 1622                 /*
 1623                  * If the process has requested that all future mappings
 1624                  * be wired, then heed this.
 1625                  */
 1626                 if (map->flags & MAP_WIREFUTURE) {
 1627                         vm_map_wire(map, *addr, *addr + size,
 1628                             VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
 1629                             VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
 1630                 }
 1631         } else {
 1632                 /*
 1633                  * If this mapping was accounted for in the vnode's
 1634                  * writecount, then undo that now.
 1635                  */
 1636                 if (writecounted)
 1637                         vnode_pager_release_writecount(object, 0, size);
 1638                 /*
 1639                  * Drop the object reference.  This destroys the
 1640                  * object if it is an unnamed anonymous mapping, or
 1641                  * a named anonymous mapping with no other references.
 1642                  */
 1643                 vm_object_deallocate(object);
 1644         }
 1645         return (vm_mmap_to_errno(rv));
 1646 }
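
/*
 * Editor's note: a hedged sketch of an in-kernel caller of vm_mmap(),
 * loosely modeled on how an image activator might map part of an
 * executable's backing vnode read-only into the current process.  The
 * "example_" name and the particular flag choices are illustrative
 * assumptions, not taken from this file; the types and constants used
 * are those already visible in this file's headers.  vm_mmap() returns
 * an errno value, already translated by vm_mmap_to_errno() below.
 */
static int
example_map_vnode_ro(struct thread *td, struct vnode *vp, vm_size_t len,
    vm_ooffset_t foff, vm_offset_t *addrp)
{

	*addrp = 0;	/* no fixed address: let the VM map find space */
	return (vm_mmap(&td->td_proc->p_vmspace->vm_map, addrp, len,
	    VM_PROT_READ, VM_PROT_READ | VM_PROT_EXECUTE,
	    MAP_SHARED | MAP_PREFAULT_READ, OBJT_VNODE, vp, foff));
}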
 1647 
 1648 /*
 1649  * Translate a Mach VM return code to zero on success or the appropriate errno
 1650  * on failure.
 1651  */
 1652 int
 1653 vm_mmap_to_errno(int rv)
 1654 {
 1655 
 1656         switch (rv) {
 1657         case KERN_SUCCESS:
 1658                 return (0);
 1659         case KERN_INVALID_ADDRESS:
 1660         case KERN_NO_SPACE:
 1661                 return (ENOMEM);
 1662         case KERN_PROTECTION_FAILURE:
 1663                 return (EACCES);
 1664         default:
 1665                 return (EINVAL);
 1666         }
 1667 }
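
/*
 * Editor's note: the typical usage pattern for the translation above, as
 * seen at the end of vm_mmap() itself -- callers feed the Mach-style
 * KERN_* status from the vm_map layer straight into vm_mmap_to_errno():
 *
 *	rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot,
 *	    docow);
 *	return (vm_mmap_to_errno(rv));
 */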
