FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c


    1 /*-
    2  * Copyright (c) 1988 University of Utah.
    3  * Copyright (c) 1991, 1993
    4  *      The Regents of the University of California.  All rights reserved.
    5  *
    6  * This code is derived from software contributed to Berkeley by
    7  * the Systems Programming Group of the University of Utah Computer
    8  * Science Department.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   35  *
   36  *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
   37  */
   38 
   39 /*
   40  * Mapped file (mmap) interface to VM
   41  */
   42 
   43 #include <sys/cdefs.h>
   44 __FBSDID("$FreeBSD: releng/9.2/sys/vm/vm_mmap.c 253801 2013-07-30 12:17:45Z jlh $");
   45 
   46 #include "opt_compat.h"
   47 #include "opt_hwpmc_hooks.h"
   48 
   49 #include <sys/param.h>
   50 #include <sys/systm.h>
   51 #include <sys/capability.h>
   52 #include <sys/kernel.h>
   53 #include <sys/lock.h>
   54 #include <sys/mutex.h>
   55 #include <sys/sysproto.h>
   56 #include <sys/filedesc.h>
   57 #include <sys/priv.h>
   58 #include <sys/proc.h>
   59 #include <sys/racct.h>
   60 #include <sys/resource.h>
   61 #include <sys/resourcevar.h>
   62 #include <sys/sysctl.h>
   63 #include <sys/vnode.h>
   64 #include <sys/fcntl.h>
   65 #include <sys/file.h>
   66 #include <sys/mman.h>
   67 #include <sys/mount.h>
   68 #include <sys/conf.h>
   69 #include <sys/stat.h>
   70 #include <sys/sysent.h>
   71 #include <sys/vmmeter.h>
   72 
   73 #include <security/mac/mac_framework.h>
   74 
   75 #include <vm/vm.h>
   76 #include <vm/vm_param.h>
   77 #include <vm/pmap.h>
   78 #include <vm/vm_map.h>
   79 #include <vm/vm_object.h>
   80 #include <vm/vm_page.h>
   81 #include <vm/vm_pager.h>
   82 #include <vm/vm_pageout.h>
   83 #include <vm/vm_extern.h>
   84 #include <vm/vm_page.h>
   85 #include <vm/vnode_pager.h>
   86 
   87 #ifdef HWPMC_HOOKS
   88 #include <sys/pmckern.h>
   89 #endif
   90 
   91 int old_mlock = 0;
   92 SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RW | CTLFLAG_TUN, &old_mlock, 0,
   93     "Do not apply RLIMIT_MEMLOCK on mlockall");
   94 TUNABLE_INT("vm.old_mlock", &old_mlock);
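
/*
 * Usage note (illustrative): because the OID above is declared with
 * CTLFLAG_TUN and registered as a tunable, the knob can be set either at
 * boot via /boot/loader.conf (vm.old_mlock="1") or at run time with
 * sysctl(8) ("sysctl vm.old_mlock=1") to restore the historic behaviour
 * of not charging mlockall(2) wirings against RLIMIT_MEMLOCK.
 */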
   95 
   96 #ifndef _SYS_SYSPROTO_H_
   97 struct sbrk_args {
   98         int incr;
   99 };
  100 #endif
  101 
  102 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
  103     int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
  104 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
  105     int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
  106 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
  107     int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
  108 
  109 /*
  110  * MPSAFE
  111  */
  112 /* ARGSUSED */
  113 int
  114 sys_sbrk(td, uap)
  115         struct thread *td;
  116         struct sbrk_args *uap;
  117 {
  118         /* Not yet implemented */
  119         return (EOPNOTSUPP);
  120 }
  121 
  122 #ifndef _SYS_SYSPROTO_H_
  123 struct sstk_args {
  124         int incr;
  125 };
  126 #endif
  127 
  128 /*
  129  * MPSAFE
  130  */
  131 /* ARGSUSED */
  132 int
  133 sys_sstk(td, uap)
  134         struct thread *td;
  135         struct sstk_args *uap;
  136 {
  137         /* Not yet implemented */
  138         return (EOPNOTSUPP);
  139 }
  140 
  141 #if defined(COMPAT_43)
  142 #ifndef _SYS_SYSPROTO_H_
  143 struct getpagesize_args {
  144         int dummy;
  145 };
  146 #endif
  147 
  148 /* ARGSUSED */
  149 int
  150 ogetpagesize(td, uap)
  151         struct thread *td;
  152         struct getpagesize_args *uap;
  153 {
  154         /* MP SAFE */
  155         td->td_retval[0] = PAGE_SIZE;
  156         return (0);
  157 }
  158 #endif                          /* COMPAT_43 */
  159 
  160 
  161 /*
  162  * Memory Map (mmap) system call.  Note that the file offset
  163  * and address are allowed to be NOT page aligned, though if
   164  * the MAP_FIXED flag is set, both must have the same remainder
  165  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  166  * page-aligned, the actual mapping starts at trunc_page(addr)
  167  * and the return value is adjusted up by the page offset.
  168  *
  169  * Generally speaking, only character devices which are themselves
  170  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  171  * there would be no cache coherency between a descriptor and a VM mapping
  172  * both to the same character device.
  173  */
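
/*
 * Illustrative userland sketch of the rule above (assumes a 4096-byte
 * PAGE_SIZE and an already open descriptor fd):
 *
 *	char *p = mmap(NULL, 100, PROT_READ, MAP_SHARED, fd, 4196);
 *
 * The file offset 4196 is 4096 + 100, so the kernel maps starting at the
 * page boundary (offset 4096) and returns the chosen base address plus
 * the 100-byte page offset; *p therefore addresses byte 4196 of the file.
 */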
  174 #ifndef _SYS_SYSPROTO_H_
  175 struct mmap_args {
  176         void *addr;
  177         size_t len;
  178         int prot;
  179         int flags;
  180         int fd;
  181         long pad;
  182         off_t pos;
  183 };
  184 #endif
  185 
  186 /*
  187  * MPSAFE
  188  */
  189 int
  190 sys_mmap(td, uap)
  191         struct thread *td;
  192         struct mmap_args *uap;
  193 {
  194 #ifdef HWPMC_HOOKS
  195         struct pmckern_map_in pkm;
  196 #endif
  197         struct file *fp;
  198         struct vnode *vp;
  199         vm_offset_t addr;
  200         vm_size_t size, pageoff;
  201         vm_prot_t cap_maxprot, prot, maxprot;
  202         void *handle;
  203         objtype_t handle_type;
  204         int flags, error;
  205         off_t pos;
  206         struct vmspace *vms = td->td_proc->p_vmspace;
  207         cap_rights_t rights;
  208 
  209         addr = (vm_offset_t) uap->addr;
  210         size = uap->len;
  211         prot = uap->prot & VM_PROT_ALL;
  212         flags = uap->flags;
  213         pos = uap->pos;
  214 
  215         fp = NULL;
  216 
  217         /*
  218          * Enforce the constraints.
  219          * Mapping of length 0 is only allowed for old binaries.
  220          * Anonymous mapping shall specify -1 as filedescriptor and
  221          * zero position for new code. Be nice to ancient a.out
  222          * binaries and correct pos for anonymous mapping, since old
  223          * ld.so sometimes issues anonymous map requests with non-zero
  224          * pos.
  225          */
  226         if (!SV_CURPROC_FLAG(SV_AOUT)) {
  227                 if ((uap->len == 0 && curproc->p_osrel >= P_OSREL_MAP_ANON) ||
  228                     ((flags & MAP_ANON) != 0 && (uap->fd != -1 || pos != 0)))
  229                         return (EINVAL);
  230         } else {
  231                 if ((flags & MAP_ANON) != 0)
  232                         pos = 0;
  233         }
  234 
  235         if (flags & MAP_STACK) {
  236                 if ((uap->fd != -1) ||
  237                     ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
  238                         return (EINVAL);
  239                 flags |= MAP_ANON;
  240                 pos = 0;
  241         }
  242 
  243         /*
  244          * Align the file position to a page boundary,
  245          * and save its page offset component.
  246          */
  247         pageoff = (pos & PAGE_MASK);
  248         pos -= pageoff;
  249 
  250         /* Adjust size for rounding (on both ends). */
  251         size += pageoff;                        /* low end... */
  252         size = (vm_size_t) round_page(size);    /* hi end */
  253 
  254         /*
  255          * Check for illegal addresses.  Watch out for address wrap... Note
  256          * that VM_*_ADDRESS are not constants due to casts (argh).
  257          */
  258         if (flags & MAP_FIXED) {
  259                 /*
  260                  * The specified address must have the same remainder
  261                  * as the file offset taken modulo PAGE_SIZE, so it
  262                  * should be aligned after adjustment by pageoff.
  263                  */
  264                 addr -= pageoff;
  265                 if (addr & PAGE_MASK)
  266                         return (EINVAL);
  267 
  268                 /* Address range must be all in user VM space. */
  269                 if (addr < vm_map_min(&vms->vm_map) ||
  270                     addr + size > vm_map_max(&vms->vm_map))
  271                         return (EINVAL);
  272                 if (addr + size < addr)
  273                         return (EINVAL);
  274         } else {
  275                 /*
  276                  * XXX for non-fixed mappings where no hint is provided or
  277                  * the hint would fall in the potential heap space,
  278                  * place it after the end of the largest possible heap.
  279                  *
  280                  * There should really be a pmap call to determine a reasonable
  281                  * location.
  282                  */
  283                 PROC_LOCK(td->td_proc);
  284                 if (addr == 0 ||
  285                     (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
  286                     addr < round_page((vm_offset_t)vms->vm_daddr +
  287                     lim_max(td->td_proc, RLIMIT_DATA))))
  288                         addr = round_page((vm_offset_t)vms->vm_daddr +
  289                             lim_max(td->td_proc, RLIMIT_DATA));
  290                 PROC_UNLOCK(td->td_proc);
  291         }
  292         if (flags & MAP_ANON) {
  293                 /*
  294                  * Mapping blank space is trivial.
  295                  */
  296                 handle = NULL;
  297                 handle_type = OBJT_DEFAULT;
  298                 maxprot = VM_PROT_ALL;
  299                 cap_maxprot = VM_PROT_ALL;
  300         } else {
  301                 /*
  302                  * Mapping file, get fp for validation and don't let the
  303                  * descriptor disappear on us if we block. Check capability
  304                  * rights, but also return the maximum rights to be combined
  305                  * with maxprot later.
  306                  */
  307                 rights = CAP_MMAP;
  308                 if (prot & PROT_READ)
  309                         rights |= CAP_READ;
  310                 if ((flags & MAP_SHARED) != 0) {
  311                         if (prot & PROT_WRITE)
  312                                 rights |= CAP_WRITE;
  313                 }
  314                 if (prot & PROT_EXEC)
  315                         rights |= CAP_MAPEXEC;
  316                 if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
  317                     &fp)) != 0)
  318                         goto done;
  319                 if (fp->f_type == DTYPE_SHM) {
  320                         handle = fp->f_data;
  321                         handle_type = OBJT_SWAP;
  322                         maxprot = VM_PROT_NONE;
  323 
  324                         /* FREAD should always be set. */
  325                         if (fp->f_flag & FREAD)
  326                                 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
  327                         if (fp->f_flag & FWRITE)
  328                                 maxprot |= VM_PROT_WRITE;
  329                         goto map;
  330                 }
  331                 if (fp->f_type != DTYPE_VNODE) {
  332                         error = ENODEV;
  333                         goto done;
  334                 }
  335 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
  336     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
  337                 /*
  338                  * POSIX shared-memory objects are defined to have
  339                  * kernel persistence, and are not defined to support
  340                  * read(2)/write(2) -- or even open(2).  Thus, we can
  341                  * use MAP_ASYNC to trade on-disk coherence for speed.
  342                  * The shm_open(3) library routine turns on the FPOSIXSHM
  343                  * flag to request this behavior.
  344                  */
  345                 if (fp->f_flag & FPOSIXSHM)
  346                         flags |= MAP_NOSYNC;
  347 #endif
  348                 vp = fp->f_vnode;
  349                 /*
  350                  * Ensure that file and memory protections are
  351                  * compatible.  Note that we only worry about
  352                  * writability if mapping is shared; in this case,
  353                  * current and max prot are dictated by the open file.
  354                  * XXX use the vnode instead?  Problem is: what
  355                  * credentials do we use for determination? What if
  356                  * proc does a setuid?
  357                  */
  358                 if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
  359                         maxprot = VM_PROT_NONE;
  360                 else
  361                         maxprot = VM_PROT_EXECUTE;
  362                 if (fp->f_flag & FREAD) {
  363                         maxprot |= VM_PROT_READ;
  364                 } else if (prot & PROT_READ) {
  365                         error = EACCES;
  366                         goto done;
  367                 }
  368                 /*
  369                  * If we are sharing potential changes (either via
  370                  * MAP_SHARED or via the implicit sharing of character
  371                  * device mappings), and we are trying to get write
  372                  * permission although we opened it without asking
  373                  * for it, bail out.
  374                  */
  375                 if ((flags & MAP_SHARED) != 0) {
  376                         if ((fp->f_flag & FWRITE) != 0) {
  377                                 maxprot |= VM_PROT_WRITE;
  378                         } else if ((prot & PROT_WRITE) != 0) {
  379                                 error = EACCES;
  380                                 goto done;
  381                         }
  382                 } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
  383                         maxprot |= VM_PROT_WRITE;
  384                         cap_maxprot |= VM_PROT_WRITE;
  385                 }
  386                 handle = (void *)vp;
  387                 handle_type = OBJT_VNODE;
  388         }
  389 map:
  390         td->td_fpop = fp;
  391         maxprot &= cap_maxprot;
  392         error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
  393             flags, handle_type, handle, pos);
  394         td->td_fpop = NULL;
  395 #ifdef HWPMC_HOOKS
  396         /* inform hwpmc(4) if an executable is being mapped */
  397         if (error == 0 && handle_type == OBJT_VNODE &&
  398             (prot & PROT_EXEC)) {
  399                 pkm.pm_file = handle;
  400                 pkm.pm_address = (uintptr_t) addr;
  401                 PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
  402         }
  403 #endif
  404         if (error == 0)
  405                 td->td_retval[0] = (register_t) (addr + pageoff);
  406 done:
  407         if (fp)
  408                 fdrop(fp, td);
  409 
  410         return (error);
  411 }
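
/*
 * Illustrative userland sketch of the maxprot derivation above: the open
 * mode of the descriptor bounds what a MAP_SHARED mapping may request,
 * while a MAP_PRIVATE (copy-on-write) mapping of a regular file may still
 * be writable.
 *
 *	int fd = open("/etc/motd", O_RDONLY);
 *	void *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);			-- fails with EACCES
 *	void *b = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE,
 *	    fd, 0);			-- succeeds; writes are not
 *					   propagated to the file
 */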
  412 
  413 int
  414 freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
  415 {
  416         struct mmap_args oargs;
  417 
  418         oargs.addr = uap->addr;
  419         oargs.len = uap->len;
  420         oargs.prot = uap->prot;
  421         oargs.flags = uap->flags;
  422         oargs.fd = uap->fd;
  423         oargs.pos = uap->pos;
  424         return (sys_mmap(td, &oargs));
  425 }
  426 
  427 #ifdef COMPAT_43
  428 #ifndef _SYS_SYSPROTO_H_
  429 struct ommap_args {
  430         caddr_t addr;
  431         int len;
  432         int prot;
  433         int flags;
  434         int fd;
  435         long pos;
  436 };
  437 #endif
  438 int
  439 ommap(td, uap)
  440         struct thread *td;
  441         struct ommap_args *uap;
  442 {
  443         struct mmap_args nargs;
  444         static const char cvtbsdprot[8] = {
  445                 0,
  446                 PROT_EXEC,
  447                 PROT_WRITE,
  448                 PROT_EXEC | PROT_WRITE,
  449                 PROT_READ,
  450                 PROT_EXEC | PROT_READ,
  451                 PROT_WRITE | PROT_READ,
  452                 PROT_EXEC | PROT_WRITE | PROT_READ,
  453         };
  454 
  455 #define OMAP_ANON       0x0002
  456 #define OMAP_COPY       0x0020
  457 #define OMAP_SHARED     0x0010
  458 #define OMAP_FIXED      0x0100
  459 
  460         nargs.addr = uap->addr;
  461         nargs.len = uap->len;
  462         nargs.prot = cvtbsdprot[uap->prot & 0x7];
  463 #ifdef COMPAT_FREEBSD32
  464 #if defined(__amd64__) || defined(__ia64__)
  465         if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) &&
  466             nargs.prot != 0)
  467                 nargs.prot |= PROT_EXEC;
  468 #endif
  469 #endif
  470         nargs.flags = 0;
  471         if (uap->flags & OMAP_ANON)
  472                 nargs.flags |= MAP_ANON;
  473         if (uap->flags & OMAP_COPY)
  474                 nargs.flags |= MAP_COPY;
  475         if (uap->flags & OMAP_SHARED)
  476                 nargs.flags |= MAP_SHARED;
  477         else
  478                 nargs.flags |= MAP_PRIVATE;
  479         if (uap->flags & OMAP_FIXED)
  480                 nargs.flags |= MAP_FIXED;
  481         nargs.fd = uap->fd;
  482         nargs.pos = uap->pos;
  483         return (sys_mmap(td, &nargs));
  484 }
  485 #endif                          /* COMPAT_43 */
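
/*
 * Worked example of the conversion above: in the old binary interface the
 * low protection bits are exec = 0x1, write = 0x2 and read = 0x4, so an
 * old prot value of 0x6 indexes cvtbsdprot[6] and becomes
 * PROT_WRITE | PROT_READ, while 0x7 becomes
 * PROT_EXEC | PROT_WRITE | PROT_READ.  The OMAP_* flags are translated to
 * their modern MAP_* counterparts, with MAP_PRIVATE supplied whenever
 * OMAP_SHARED is absent.
 */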
  486 
  487 
  488 #ifndef _SYS_SYSPROTO_H_
  489 struct msync_args {
  490         void *addr;
  491         size_t len;
  492         int flags;
  493 };
  494 #endif
  495 /*
  496  * MPSAFE
  497  */
  498 int
  499 sys_msync(td, uap)
  500         struct thread *td;
  501         struct msync_args *uap;
  502 {
  503         vm_offset_t addr;
  504         vm_size_t size, pageoff;
  505         int flags;
  506         vm_map_t map;
  507         int rv;
  508 
  509         addr = (vm_offset_t) uap->addr;
  510         size = uap->len;
  511         flags = uap->flags;
  512 
  513         pageoff = (addr & PAGE_MASK);
  514         addr -= pageoff;
  515         size += pageoff;
  516         size = (vm_size_t) round_page(size);
  517         if (addr + size < addr)
  518                 return (EINVAL);
  519 
  520         if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
  521                 return (EINVAL);
  522 
  523         map = &td->td_proc->p_vmspace->vm_map;
  524 
  525         /*
  526          * Clean the pages and interpret the return value.
  527          */
  528         rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
  529             (flags & MS_INVALIDATE) != 0);
  530         switch (rv) {
  531         case KERN_SUCCESS:
  532                 return (0);
  533         case KERN_INVALID_ADDRESS:
  534                 return (EINVAL);        /* Sun returns ENOMEM? */
  535         case KERN_INVALID_ARGUMENT:
  536                 return (EBUSY);
  537         case KERN_FAILURE:
  538                 return (EIO);
  539         default:
  540                 return (EINVAL);
  541         }
  542 }
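
/*
 * Illustrative userland sketch (assumes a 4096-byte PAGE_SIZE and a
 * page-aligned "base" returned by mmap): the range passed to
 * vm_map_sync() is always page rounded, and MS_ASYNC and MS_INVALIDATE
 * are mutually exclusive.
 *
 *	msync(base, 100, MS_SYNC);			-- size is rounded up;
 *							   the whole first page
 *							   is flushed
 *	msync(base, 4096, MS_ASYNC | MS_INVALIDATE);	-- fails with EINVAL
 */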
  543 
  544 #ifndef _SYS_SYSPROTO_H_
  545 struct munmap_args {
  546         void *addr;
  547         size_t len;
  548 };
  549 #endif
  550 /*
  551  * MPSAFE
  552  */
  553 int
  554 sys_munmap(td, uap)
  555         struct thread *td;
  556         struct munmap_args *uap;
  557 {
  558 #ifdef HWPMC_HOOKS
  559         struct pmckern_map_out pkm;
  560         vm_map_entry_t entry;
  561 #endif
  562         vm_offset_t addr;
  563         vm_size_t size, pageoff;
  564         vm_map_t map;
  565 
  566         addr = (vm_offset_t) uap->addr;
  567         size = uap->len;
  568         if (size == 0)
  569                 return (EINVAL);
  570 
  571         pageoff = (addr & PAGE_MASK);
  572         addr -= pageoff;
  573         size += pageoff;
  574         size = (vm_size_t) round_page(size);
  575         if (addr + size < addr)
  576                 return (EINVAL);
  577 
  578         /*
  579          * Check for illegal addresses.  Watch out for address wrap...
  580          */
  581         map = &td->td_proc->p_vmspace->vm_map;
  582         if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
  583                 return (EINVAL);
  584         vm_map_lock(map);
  585 #ifdef HWPMC_HOOKS
  586         /*
  587          * Inform hwpmc if the address range being unmapped contains
  588          * an executable region.
  589          */
  590         pkm.pm_address = (uintptr_t) NULL;
  591         if (vm_map_lookup_entry(map, addr, &entry)) {
  592                 for (;
  593                      entry != &map->header && entry->start < addr + size;
  594                      entry = entry->next) {
  595                         if (vm_map_check_protection(map, entry->start,
  596                                 entry->end, VM_PROT_EXECUTE) == TRUE) {
  597                                 pkm.pm_address = (uintptr_t) addr;
  598                                 pkm.pm_size = (size_t) size;
  599                                 break;
  600                         }
  601                 }
  602         }
  603 #endif
  604         vm_map_delete(map, addr, addr + size);
  605 
  606 #ifdef HWPMC_HOOKS
  607         /* downgrade the lock to prevent a LOR with the pmc-sx lock */
  608         vm_map_lock_downgrade(map);
  609         if (pkm.pm_address != (uintptr_t) NULL)
  610                 PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
  611         vm_map_unlock_read(map);
  612 #else
  613         vm_map_unlock(map);
  614 #endif
  615         /* vm_map_delete returns nothing but KERN_SUCCESS anyway */
  616         return (0);
  617 }
  618 
  619 #ifndef _SYS_SYSPROTO_H_
  620 struct mprotect_args {
  621         const void *addr;
  622         size_t len;
  623         int prot;
  624 };
  625 #endif
  626 /*
  627  * MPSAFE
  628  */
  629 int
  630 sys_mprotect(td, uap)
  631         struct thread *td;
  632         struct mprotect_args *uap;
  633 {
  634         vm_offset_t addr;
  635         vm_size_t size, pageoff;
  636         vm_prot_t prot;
  637 
  638         addr = (vm_offset_t) uap->addr;
  639         size = uap->len;
  640         prot = uap->prot & VM_PROT_ALL;
  641 
  642         pageoff = (addr & PAGE_MASK);
  643         addr -= pageoff;
  644         size += pageoff;
  645         size = (vm_size_t) round_page(size);
  646         if (addr + size < addr)
  647                 return (EINVAL);
  648 
  649         switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
  650             addr + size, prot, FALSE)) {
  651         case KERN_SUCCESS:
  652                 return (0);
  653         case KERN_PROTECTION_FAILURE:
  654                 return (EACCES);
  655         case KERN_RESOURCE_SHORTAGE:
  656                 return (ENOMEM);
  657         }
  658         return (EINVAL);
  659 }
  660 
  661 #ifndef _SYS_SYSPROTO_H_
  662 struct minherit_args {
  663         void *addr;
  664         size_t len;
  665         int inherit;
  666 };
  667 #endif
  668 /*
  669  * MPSAFE
  670  */
  671 int
  672 sys_minherit(td, uap)
  673         struct thread *td;
  674         struct minherit_args *uap;
  675 {
  676         vm_offset_t addr;
  677         vm_size_t size, pageoff;
  678         vm_inherit_t inherit;
  679 
  680         addr = (vm_offset_t)uap->addr;
  681         size = uap->len;
  682         inherit = uap->inherit;
  683 
  684         pageoff = (addr & PAGE_MASK);
  685         addr -= pageoff;
  686         size += pageoff;
  687         size = (vm_size_t) round_page(size);
  688         if (addr + size < addr)
  689                 return (EINVAL);
  690 
  691         switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
  692             addr + size, inherit)) {
  693         case KERN_SUCCESS:
  694                 return (0);
  695         case KERN_PROTECTION_FAILURE:
  696                 return (EACCES);
  697         }
  698         return (EINVAL);
  699 }
  700 
  701 #ifndef _SYS_SYSPROTO_H_
  702 struct madvise_args {
  703         void *addr;
  704         size_t len;
  705         int behav;
  706 };
  707 #endif
  708 
  709 /*
  710  * MPSAFE
  711  */
  712 /* ARGSUSED */
  713 int
  714 sys_madvise(td, uap)
  715         struct thread *td;
  716         struct madvise_args *uap;
  717 {
  718         vm_offset_t start, end;
  719         vm_map_t map;
  720         struct proc *p;
  721         int error;
  722 
  723         /*
  724          * Check for our special case, advising the swap pager we are
  725          * "immortal."
  726          */
  727         if (uap->behav == MADV_PROTECT) {
  728                 error = priv_check(td, PRIV_VM_MADV_PROTECT);
  729                 if (error == 0) {
  730                         p = td->td_proc;
  731                         PROC_LOCK(p);
  732                         p->p_flag |= P_PROTECTED;
  733                         PROC_UNLOCK(p);
  734                 }
  735                 return (error);
  736         }
  737         /*
  738          * Check for illegal behavior
  739          */
  740         if (uap->behav < 0 || uap->behav > MADV_CORE)
  741                 return (EINVAL);
  742         /*
  743          * Check for illegal addresses.  Watch out for address wrap... Note
  744          * that VM_*_ADDRESS are not constants due to casts (argh).
  745          */
  746         map = &td->td_proc->p_vmspace->vm_map;
  747         if ((vm_offset_t)uap->addr < vm_map_min(map) ||
  748             (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
  749                 return (EINVAL);
  750         if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
  751                 return (EINVAL);
  752 
  753         /*
  754          * Since this routine is only advisory, we default to conservative
  755          * behavior.
  756          */
  757         start = trunc_page((vm_offset_t) uap->addr);
  758         end = round_page((vm_offset_t) uap->addr + uap->len);
  759 
  760         if (vm_map_madvise(map, start, end, uap->behav))
  761                 return (EINVAL);
  762         return (0);
  763 }
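
/*
 * Illustrative userland sketch: ordinary advice values are simply handed
 * to vm_map_madvise() over the page-rounded range, while MADV_PROTECT is
 * the privileged special case handled first above.
 *
 *	madvise(base, len, MADV_WILLNEED);	-- read-ahead hint
 *	madvise(NULL, 0, MADV_PROTECT);		-- requires privilege; sets
 *						   P_PROTECTED so the process
 *						   is spared when the system
 *						   runs out of memory/swap
 */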
  764 
  765 #ifndef _SYS_SYSPROTO_H_
  766 struct mincore_args {
  767         const void *addr;
  768         size_t len;
  769         char *vec;
  770 };
  771 #endif
  772 
  773 /*
  774  * MPSAFE
  775  */
  776 /* ARGSUSED */
  777 int
  778 sys_mincore(td, uap)
  779         struct thread *td;
  780         struct mincore_args *uap;
  781 {
  782         vm_offset_t addr, first_addr;
  783         vm_offset_t end, cend;
  784         pmap_t pmap;
  785         vm_map_t map;
  786         char *vec;
  787         int error = 0;
  788         int vecindex, lastvecindex;
  789         vm_map_entry_t current;
  790         vm_map_entry_t entry;
  791         vm_object_t object;
  792         vm_paddr_t locked_pa;
  793         vm_page_t m;
  794         vm_pindex_t pindex;
  795         int mincoreinfo;
  796         unsigned int timestamp;
  797         boolean_t locked;
  798 
  799         /*
  800          * Make sure that the addresses presented are valid for user
  801          * mode.
  802          */
  803         first_addr = addr = trunc_page((vm_offset_t) uap->addr);
  804         end = addr + (vm_size_t)round_page(uap->len);
  805         map = &td->td_proc->p_vmspace->vm_map;
  806         if (end > vm_map_max(map) || end < addr)
  807                 return (ENOMEM);
  808 
  809         /*
  810          * Address of byte vector
  811          */
  812         vec = uap->vec;
  813 
  814         pmap = vmspace_pmap(td->td_proc->p_vmspace);
  815 
  816         vm_map_lock_read(map);
  817 RestartScan:
  818         timestamp = map->timestamp;
  819 
  820         if (!vm_map_lookup_entry(map, addr, &entry)) {
  821                 vm_map_unlock_read(map);
  822                 return (ENOMEM);
  823         }
  824 
  825         /*
  826          * Do this on a map entry basis so that if the pages are not
   827  * in the current process's address space, we can easily look
  828          * up the pages elsewhere.
  829          */
  830         lastvecindex = -1;
  831         for (current = entry;
  832             (current != &map->header) && (current->start < end);
  833             current = current->next) {
  834 
  835                 /*
  836                  * check for contiguity
  837                  */
  838                 if (current->end < end &&
  839                     (entry->next == &map->header ||
  840                      current->next->start > current->end)) {
  841                         vm_map_unlock_read(map);
  842                         return (ENOMEM);
  843                 }
  844 
  845                 /*
  846                  * ignore submaps (for now) or null objects
  847                  */
  848                 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
  849                         current->object.vm_object == NULL)
  850                         continue;
  851 
  852                 /*
  853                  * limit this scan to the current map entry and the
  854                  * limits for the mincore call
  855                  */
  856                 if (addr < current->start)
  857                         addr = current->start;
  858                 cend = current->end;
  859                 if (cend > end)
  860                         cend = end;
  861 
  862                 /*
  863                  * scan this entry one page at a time
  864                  */
  865                 while (addr < cend) {
  866                         /*
  867                          * Check pmap first, it is likely faster, also
  868                          * it can provide info as to whether we are the
  869                          * one referencing or modifying the page.
  870                          */
  871                         object = NULL;
  872                         locked_pa = 0;
  873                 retry:
  874                         m = NULL;
  875                         mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
  876                         if (locked_pa != 0) {
  877                                 /*
  878                                  * The page is mapped by this process but not
  879                                  * both accessed and modified.  It is also
  880                                  * managed.  Acquire the object lock so that
  881                                  * other mappings might be examined.
  882                                  */
  883                                 m = PHYS_TO_VM_PAGE(locked_pa);
  884                                 if (m->object != object) {
  885                                         if (object != NULL)
  886                                                 VM_OBJECT_UNLOCK(object);
  887                                         object = m->object;
  888                                         locked = VM_OBJECT_TRYLOCK(object);
  889                                         vm_page_unlock(m);
  890                                         if (!locked) {
  891                                                 VM_OBJECT_LOCK(object);
  892                                                 vm_page_lock(m);
  893                                                 goto retry;
  894                                         }
  895                                 } else
  896                                         vm_page_unlock(m);
  897                                 KASSERT(m->valid == VM_PAGE_BITS_ALL,
  898                                     ("mincore: page %p is mapped but invalid",
  899                                     m));
  900                         } else if (mincoreinfo == 0) {
  901                                 /*
  902                                  * The page is not mapped by this process.  If
  903                                  * the object implements managed pages, then
  904                                  * determine if the page is resident so that
  905                                  * the mappings might be examined.
  906                                  */
  907                                 if (current->object.vm_object != object) {
  908                                         if (object != NULL)
  909                                                 VM_OBJECT_UNLOCK(object);
  910                                         object = current->object.vm_object;
  911                                         VM_OBJECT_LOCK(object);
  912                                 }
  913                                 if (object->type == OBJT_DEFAULT ||
  914                                     object->type == OBJT_SWAP ||
  915                                     object->type == OBJT_VNODE) {
  916                                         pindex = OFF_TO_IDX(current->offset +
  917                                             (addr - current->start));
  918                                         m = vm_page_lookup(object, pindex);
  919                                         if (m == NULL &&
  920                                             vm_page_is_cached(object, pindex))
  921                                                 mincoreinfo = MINCORE_INCORE;
  922                                         if (m != NULL && m->valid == 0)
  923                                                 m = NULL;
  924                                         if (m != NULL)
  925                                                 mincoreinfo = MINCORE_INCORE;
  926                                 }
  927                         }
  928                         if (m != NULL) {
  929                                 /* Examine other mappings to the page. */
  930                                 if (m->dirty == 0 && pmap_is_modified(m))
  931                                         vm_page_dirty(m);
  932                                 if (m->dirty != 0)
  933                                         mincoreinfo |= MINCORE_MODIFIED_OTHER;
  934                                 /*
  935                                  * The first test for PGA_REFERENCED is an
  936                                  * optimization.  The second test is
  937                                  * required because a concurrent pmap
  938                                  * operation could clear the last reference
  939                                  * and set PGA_REFERENCED before the call to
  940                                  * pmap_is_referenced(). 
  941                                  */
  942                                 if ((m->aflags & PGA_REFERENCED) != 0 ||
  943                                     pmap_is_referenced(m) ||
  944                                     (m->aflags & PGA_REFERENCED) != 0)
  945                                         mincoreinfo |= MINCORE_REFERENCED_OTHER;
  946                         }
  947                         if (object != NULL)
  948                                 VM_OBJECT_UNLOCK(object);
  949 
  950                         /*
  951                          * subyte may page fault.  In case it needs to modify
  952                          * the map, we release the lock.
  953                          */
  954                         vm_map_unlock_read(map);
  955 
  956                         /*
  957                          * calculate index into user supplied byte vector
  958                          */
  959                         vecindex = OFF_TO_IDX(addr - first_addr);
  960 
  961                         /*
  962                          * If we have skipped map entries, we need to make sure that
  963                          * the byte vector is zeroed for those skipped entries.
  964                          */
  965                         while ((lastvecindex + 1) < vecindex) {
   966                                 error = subyte(vec + lastvecindex + 1, 0);
  967                                 if (error) {
  968                                         error = EFAULT;
  969                                         goto done2;
  970                                 }
  971                                 ++lastvecindex;
  972                         }
  973 
  974                         /*
  975                          * Pass the page information to the user
  976                          */
  977                         error = subyte(vec + vecindex, mincoreinfo);
  978                         if (error) {
  979                                 error = EFAULT;
  980                                 goto done2;
  981                         }
  982 
  983                         /*
  984                          * If the map has changed, due to the subyte, the previous
  985                          * output may be invalid.
  986                          */
  987                         vm_map_lock_read(map);
  988                         if (timestamp != map->timestamp)
  989                                 goto RestartScan;
  990 
  991                         lastvecindex = vecindex;
  992                         addr += PAGE_SIZE;
  993                 }
  994         }
  995 
  996         /*
  997          * subyte may page fault.  In case it needs to modify
  998          * the map, we release the lock.
  999          */
 1000         vm_map_unlock_read(map);
 1001 
 1002         /*
 1003          * Zero the last entries in the byte vector.
 1004          */
 1005         vecindex = OFF_TO_IDX(end - first_addr);
 1006         while ((lastvecindex + 1) < vecindex) {
  1007                 error = subyte(vec + lastvecindex + 1, 0);
 1008                 if (error) {
 1009                         error = EFAULT;
 1010                         goto done2;
 1011                 }
 1012                 ++lastvecindex;
 1013         }
 1014 
 1015         /*
 1016          * If the map has changed, due to the subyte, the previous
 1017          * output may be invalid.
 1018          */
 1019         vm_map_lock_read(map);
 1020         if (timestamp != map->timestamp)
 1021                 goto RestartScan;
 1022         vm_map_unlock_read(map);
 1023 done2:
 1024         return (error);
 1025 }
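
/*
 * Illustrative userland sketch: mincore() reports one status byte per page
 * of the page-rounded range, exactly as the subyte() stores above build it.
 * With a 4096-byte PAGE_SIZE, a 10000-byte region therefore needs a
 * 3-byte vector:
 *
 *	char vec[3];
 *	mincore(addr, 10000, vec);
 *
 * On success vec[i] carries the MINCORE_* bits for the i'th page of the
 * range.
 */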
 1026 
 1027 #ifndef _SYS_SYSPROTO_H_
 1028 struct mlock_args {
 1029         const void *addr;
 1030         size_t len;
 1031 };
 1032 #endif
 1033 /*
 1034  * MPSAFE
 1035  */
 1036 int
 1037 sys_mlock(td, uap)
 1038         struct thread *td;
 1039         struct mlock_args *uap;
 1040 {
 1041         struct proc *proc;
 1042         vm_offset_t addr, end, last, start;
 1043         vm_size_t npages, size;
 1044         vm_map_t map;
 1045         unsigned long nsize;
 1046         int error;
 1047 
 1048         error = priv_check(td, PRIV_VM_MLOCK);
 1049         if (error)
 1050                 return (error);
 1051         addr = (vm_offset_t)uap->addr;
 1052         size = uap->len;
 1053         last = addr + size;
 1054         start = trunc_page(addr);
 1055         end = round_page(last);
 1056         if (last < addr || end < addr)
 1057                 return (EINVAL);
 1058         npages = atop(end - start);
 1059         if (npages > vm_page_max_wired)
 1060                 return (ENOMEM);
 1061         proc = td->td_proc;
 1062         map = &proc->p_vmspace->vm_map;
 1063         PROC_LOCK(proc);
 1064         nsize = ptoa(npages + pmap_wired_count(map->pmap));
 1065         if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
 1066                 PROC_UNLOCK(proc);
 1067                 return (ENOMEM);
 1068         }
 1069         PROC_UNLOCK(proc);
 1070         if (npages + cnt.v_wire_count > vm_page_max_wired)
 1071                 return (EAGAIN);
 1072 #ifdef RACCT
 1073         PROC_LOCK(proc);
 1074         error = racct_set(proc, RACCT_MEMLOCK, nsize);
 1075         PROC_UNLOCK(proc);
 1076         if (error != 0)
 1077                 return (ENOMEM);
 1078 #endif
 1079         error = vm_map_wire(map, start, end,
 1080             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1081 #ifdef RACCT
 1082         if (error != KERN_SUCCESS) {
 1083                 PROC_LOCK(proc);
 1084                 racct_set(proc, RACCT_MEMLOCK,
 1085                     ptoa(pmap_wired_count(map->pmap)));
 1086                 PROC_UNLOCK(proc);
 1087         }
 1088 #endif
 1089         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1090 }
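
/*
 * Illustrative userland sketch (assumes a 4096-byte PAGE_SIZE): the locked
 * range is page rounded, so even a one-byte request wires a full page, and
 * the request is charged against both RLIMIT_MEMLOCK and the global
 * vm_page_max_wired cap.
 *
 *	if (mlock(p, 1) == -1)
 *		-- ENOMEM: RLIMIT_MEMLOCK (or vm_page_max_wired) exceeded
 *		-- EAGAIN: wiring would push the system past vm_page_max_wired
 */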
 1091 
 1092 #ifndef _SYS_SYSPROTO_H_
 1093 struct mlockall_args {
 1094         int     how;
 1095 };
 1096 #endif
 1097 
 1098 /*
 1099  * MPSAFE
 1100  */
 1101 int
 1102 sys_mlockall(td, uap)
 1103         struct thread *td;
 1104         struct mlockall_args *uap;
 1105 {
 1106         vm_map_t map;
 1107         int error;
 1108 
 1109         map = &td->td_proc->p_vmspace->vm_map;
 1110         error = priv_check(td, PRIV_VM_MLOCK);
 1111         if (error)
 1112                 return (error);
 1113 
 1114         if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 1115                 return (EINVAL);
 1116 
 1117         /*
 1118          * If wiring all pages in the process would cause it to exceed
 1119          * a hard resource limit, return ENOMEM.
 1120          */
 1121         if (!old_mlock && uap->how & MCL_CURRENT) {
 1122                 PROC_LOCK(td->td_proc);
 1123                 if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
 1124                         PROC_UNLOCK(td->td_proc);
 1125                         return (ENOMEM);
 1126                 }
 1127                 PROC_UNLOCK(td->td_proc);
 1128         }
 1129 #ifdef RACCT
 1130         PROC_LOCK(td->td_proc);
 1131         error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
 1132         PROC_UNLOCK(td->td_proc);
 1133         if (error != 0)
 1134                 return (ENOMEM);
 1135 #endif
 1136 
 1137         if (uap->how & MCL_FUTURE) {
 1138                 vm_map_lock(map);
 1139                 vm_map_modflags(map, MAP_WIREFUTURE, 0);
 1140                 vm_map_unlock(map);
 1141                 error = 0;
 1142         }
 1143 
 1144         if (uap->how & MCL_CURRENT) {
 1145                 /*
 1146                  * P1003.1-2001 mandates that all currently mapped pages
 1147                  * will be memory resident and locked (wired) upon return
 1148                  * from mlockall(). vm_map_wire() will wire pages, by
 1149                  * calling vm_fault_wire() for each page in the region.
 1150                  */
 1151                 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 1152                     VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1153                 error = (error == KERN_SUCCESS ? 0 : EAGAIN);
 1154         }
 1155 #ifdef RACCT
 1156         if (error != KERN_SUCCESS) {
 1157                 PROC_LOCK(td->td_proc);
 1158                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1159                     ptoa(pmap_wired_count(map->pmap)));
 1160                 PROC_UNLOCK(td->td_proc);
 1161         }
 1162 #endif
 1163 
 1164         return (error);
 1165 }
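
/*
 * Illustrative userland sketch: MCL_CURRENT wires every existing mapping,
 * while MCL_FUTURE sets MAP_WIREFUTURE on the vmspace so that mappings
 * created later are wired as they are established.
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
 *		err(1, "mlockall");
 */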
 1166 
 1167 #ifndef _SYS_SYSPROTO_H_
 1168 struct munlockall_args {
 1169         register_t dummy;
 1170 };
 1171 #endif
 1172 
 1173 /*
 1174  * MPSAFE
 1175  */
 1176 int
 1177 sys_munlockall(td, uap)
 1178         struct thread *td;
 1179         struct munlockall_args *uap;
 1180 {
 1181         vm_map_t map;
 1182         int error;
 1183 
 1184         map = &td->td_proc->p_vmspace->vm_map;
 1185         error = priv_check(td, PRIV_VM_MUNLOCK);
 1186         if (error)
 1187                 return (error);
 1188 
 1189         /* Clear the MAP_WIREFUTURE flag from this vm_map. */
 1190         vm_map_lock(map);
 1191         vm_map_modflags(map, 0, MAP_WIREFUTURE);
 1192         vm_map_unlock(map);
 1193 
 1194         /* Forcibly unwire all pages. */
 1195         error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 1196             VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1197 #ifdef RACCT
 1198         if (error == KERN_SUCCESS) {
 1199                 PROC_LOCK(td->td_proc);
 1200                 racct_set(td->td_proc, RACCT_MEMLOCK, 0);
 1201                 PROC_UNLOCK(td->td_proc);
 1202         }
 1203 #endif
 1204 
 1205         return (error);
 1206 }
 1207 
 1208 #ifndef _SYS_SYSPROTO_H_
 1209 struct munlock_args {
 1210         const void *addr;
 1211         size_t len;
 1212 };
 1213 #endif
 1214 /*
 1215  * MPSAFE
 1216  */
 1217 int
 1218 sys_munlock(td, uap)
 1219         struct thread *td;
 1220         struct munlock_args *uap;
 1221 {
 1222         vm_offset_t addr, end, last, start;
 1223         vm_size_t size;
 1224 #ifdef RACCT
 1225         vm_map_t map;
 1226 #endif
 1227         int error;
 1228 
 1229         error = priv_check(td, PRIV_VM_MUNLOCK);
 1230         if (error)
 1231                 return (error);
 1232         addr = (vm_offset_t)uap->addr;
 1233         size = uap->len;
 1234         last = addr + size;
 1235         start = trunc_page(addr);
 1236         end = round_page(last);
 1237         if (last < addr || end < addr)
 1238                 return (EINVAL);
 1239         error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
 1240             VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
 1241 #ifdef RACCT
 1242         if (error == KERN_SUCCESS) {
 1243                 PROC_LOCK(td->td_proc);
 1244                 map = &td->td_proc->p_vmspace->vm_map;
 1245                 racct_set(td->td_proc, RACCT_MEMLOCK,
 1246                     ptoa(pmap_wired_count(map->pmap)));
 1247                 PROC_UNLOCK(td->td_proc);
 1248         }
 1249 #endif
 1250         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1251 }
 1252 
 1253 /*
 1254  * vm_mmap_vnode()
 1255  *
 1256  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1257  * operations on vnodes.
 1258  *
 1259  * For VCHR vnodes, the vnode lock is held over the call to
 1260  * vm_mmap_cdev() to keep vp->v_rdev valid.
 1261  */
 1262 int
 1263 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 1264     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1265     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
 1266     boolean_t *writecounted)
 1267 {
 1268         struct vattr va;
 1269         vm_object_t obj;
 1270         vm_offset_t foff;
 1271         struct mount *mp;
 1272         struct ucred *cred;
 1273         int error, flags, locktype, vfslocked;
 1274 
 1275         mp = vp->v_mount;
 1276         cred = td->td_ucred;
 1277         if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
 1278                 locktype = LK_EXCLUSIVE;
 1279         else
 1280                 locktype = LK_SHARED;
 1281         vfslocked = VFS_LOCK_GIANT(mp);
 1282         if ((error = vget(vp, locktype, td)) != 0) {
 1283                 VFS_UNLOCK_GIANT(vfslocked);
 1284                 return (error);
 1285         }
 1286         foff = *foffp;
 1287         flags = *flagsp;
 1288         obj = vp->v_object;
 1289         if (vp->v_type == VREG) {
 1290                 /*
 1291                  * Get the proper underlying object
 1292                  */
 1293                 if (obj == NULL) {
 1294                         error = EINVAL;
 1295                         goto done;
 1296                 }
 1297                 if (obj->handle != vp) {
 1298                         vput(vp);
 1299                         vp = (struct vnode *)obj->handle;
 1300                         /*
 1301                          * Bypass filesystems obey the mpsafety of the
 1302                          * underlying fs.
 1303                          */
 1304                         error = vget(vp, locktype, td);
 1305                         if (error != 0) {
 1306                                 VFS_UNLOCK_GIANT(vfslocked);
 1307                                 return (error);
 1308                         }
 1309                 }
 1310                 if (locktype == LK_EXCLUSIVE) {
 1311                         *writecounted = TRUE;
 1312                         vnode_pager_update_writecount(obj, 0, objsize);
 1313                 }
 1314         } else if (vp->v_type == VCHR) {
 1315                 error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
 1316                     vp->v_rdev, foffp, objp);
 1317                 if (error == 0)
 1318                         goto mark_atime;
 1319                 goto done;
 1320         } else {
 1321                 error = EINVAL;
 1322                 goto done;
 1323         }
 1324         if ((error = VOP_GETATTR(vp, &va, cred)))
 1325                 goto done;
 1326 #ifdef MAC
 1327         error = mac_vnode_check_mmap(cred, vp, prot, flags);
 1328         if (error != 0)
 1329                 goto done;
 1330 #endif
 1331         if ((flags & MAP_SHARED) != 0) {
 1332                 if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
 1333                         if (prot & PROT_WRITE) {
 1334                                 error = EPERM;
 1335                                 goto done;
 1336                         }
 1337                         *maxprotp &= ~VM_PROT_WRITE;
 1338                 }
 1339         }
 1340         /*
 1341          * If it is a regular file without any references
 1342          * we do not need to sync it.
 1343          * Adjust object size to be the size of actual file.
 1344          */
 1345         objsize = round_page(va.va_size);
 1346         if (va.va_nlink == 0)
 1347                 flags |= MAP_NOSYNC;
 1348         obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred);
 1349         if (obj == NULL) {
 1350                 error = ENOMEM;
 1351                 goto done;
 1352         }
 1353         *objp = obj;
 1354         *flagsp = flags;
 1355 
 1356 mark_atime:
 1357         vfs_mark_atime(vp, cred);
 1358 
 1359 done:
 1360         if (error != 0 && *writecounted) {
 1361                 *writecounted = FALSE;
 1362                 vnode_pager_update_writecount(obj, objsize, 0);
 1363         }
 1364         vput(vp);
 1365         VFS_UNLOCK_GIANT(vfslocked);
 1366         return (error);
 1367 }
 1368 
 1369 /*
 1370  * vm_mmap_cdev()
 1371  *
 1372  * MPSAFE
 1373  *
 1374  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1375  * operations on cdevs.
 1376  */
 1377 int
 1378 vm_mmap_cdev(struct thread *td, vm_size_t objsize,
 1379     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1380     struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
 1381 {
 1382         vm_object_t obj;
 1383         struct cdevsw *dsw;
 1384         int error, flags, ref;
 1385 
 1386         flags = *flagsp;
 1387 
 1388         dsw = dev_refthread(cdev, &ref);
 1389         if (dsw == NULL)
 1390                 return (ENXIO);
 1391         if (dsw->d_flags & D_MMAP_ANON) {
 1392                 dev_relthread(cdev, ref);
 1393                 *maxprotp = VM_PROT_ALL;
 1394                 *flagsp |= MAP_ANON;
 1395                 return (0);
 1396         }
 1397         /*
 1398          * cdevs do not provide private mappings of any kind.
 1399          */
 1400         if ((*maxprotp & VM_PROT_WRITE) == 0 &&
 1401             (prot & PROT_WRITE) != 0) {
 1402                 dev_relthread(cdev, ref);
 1403                 return (EACCES);
 1404         }
 1405         if (flags & (MAP_PRIVATE|MAP_COPY)) {
 1406                 dev_relthread(cdev, ref);
 1407                 return (EINVAL);
 1408         }
 1409         /*
 1410          * Force device mappings to be shared.
 1411          */
 1412         flags |= MAP_SHARED;
 1413 #ifdef MAC_XXX
 1414         error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
 1415         if (error != 0) {
 1416                 dev_relthread(cdev, ref);
 1417                 return (error);
 1418         }
 1419 #endif
 1420         /*
 1421          * First, try d_mmap_single().  If that is not implemented
 1422          * (returns ENODEV), fall back to using the device pager.
 1423          * Note that d_mmap_single() must return a reference to the
 1424          * object (it needs to bump the reference count of the object
 1425          * it returns somehow).
 1426          *
 1427          * XXX assumes VM_PROT_* == PROT_*
 1428          */
 1429         error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
 1430         dev_relthread(cdev, ref);
 1431         if (error != ENODEV)
 1432                 return (error);
 1433         obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
 1434             td->td_ucred);
 1435         if (obj == NULL)
 1436                 return (EINVAL);
 1437         *objp = obj;
 1438         *flagsp = flags;
 1439         return (0);
 1440 }
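
/*
 * Illustrative sketch of a hypothetical driver taking the D_MMAP_ANON
 * shortcut above (the technique used by /dev/zero): mmap(2) on such a
 * device is turned into an ordinary anonymous mapping.  Drivers without
 * the flag get their d_mmap_single() entry point tried first; a return of
 * ENODEV falls back to the generic OBJT_DEVICE pager object.
 *
 *	static struct cdevsw example_zero_cdevsw = {
 *		.d_version =	D_VERSION,
 *		.d_flags =	D_MMAP_ANON,
 *		.d_name =	"examplezero",
 *	};
 */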
 1441 
 1442 /*
 1443  * vm_mmap_shm()
 1444  *
 1445  * MPSAFE
 1446  *
 1447  * Helper function for vm_mmap.  Perform sanity check specific for mmap
 1448  * operations on shm file descriptors.
 1449  */
 1450 int
 1451 vm_mmap_shm(struct thread *td, vm_size_t objsize,
 1452     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
 1453     struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
 1454 {
 1455         int error;
 1456 
 1457         if ((*flagsp & MAP_SHARED) != 0 &&
 1458             (*maxprotp & VM_PROT_WRITE) == 0 &&
 1459             (prot & PROT_WRITE) != 0)
 1460                 return (EACCES);
 1461 #ifdef MAC
 1462         error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
 1463         if (error != 0)
 1464                 return (error);
 1465 #endif
 1466         error = shm_mmap(shmfd, objsize, foff, objp);
 1467         if (error)
 1468                 return (error);
 1469         return (0);
 1470 }
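
/*
 * Illustrative userland sketch of the check above: a POSIX shared memory
 * object must have been opened for writing before it can back a shared,
 * writable mapping.
 *
 *	int fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, 4096);
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *	    fd, 0);
 *
 * Had the object been opened O_RDONLY, the same mmap() call would fail
 * with EACCES.
 */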
 1471 
 1472 /*
 1473  * vm_mmap()
 1474  *
 1475  * MPSAFE
 1476  *
 1477  * Internal version of mmap.  Currently used by mmap, exec, and sys5
 1478  * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 1479  */
 1480 int
 1481 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1482         vm_prot_t maxprot, int flags,
 1483         objtype_t handle_type, void *handle,
 1484         vm_ooffset_t foff)
 1485 {
 1486         boolean_t fitit;
 1487         vm_object_t object = NULL;
 1488         struct thread *td = curthread;
 1489         int docow, error, rv;
 1490         boolean_t writecounted;
 1491 
 1492         if (size == 0)
 1493                 return (0);
 1494 
 1495         size = round_page(size);
 1496 
 1497         PROC_LOCK(td->td_proc);
 1498         if (td->td_proc->p_vmspace->vm_map.size + size >
 1499             lim_cur(td->td_proc, RLIMIT_VMEM)) {
 1500                 if (!old_mlock && map->flags & MAP_WIREFUTURE) {
 1501                         if (ptoa(pmap_wired_count(map->pmap)) + size >
 1502                             lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
 1503                                 racct_set_force(td->td_proc, RACCT_VMEM,
 1504                                     map->size);
 1505                                 PROC_UNLOCK(td->td_proc);
 1506                                 return (ENOMEM);
 1507                         }
 1508                         error = racct_set(td->td_proc, RACCT_MEMLOCK,
 1509                             ptoa(pmap_wired_count(map->pmap)) + size);
 1510                         if (error != 0) {
 1511                                 racct_set_force(td->td_proc, RACCT_VMEM,
 1512                                     map->size);
 1513                                 PROC_UNLOCK(td->td_proc);
 1514                                 return (error);
 1515                         }
 1516                 }
 1517                 PROC_UNLOCK(td->td_proc);
 1518                 return (ENOMEM);
 1519         }
 1520         if (racct_set(td->td_proc, RACCT_VMEM,
 1521             td->td_proc->p_vmspace->vm_map.size + size)) {
 1522                 PROC_UNLOCK(td->td_proc);
 1523                 return (ENOMEM);
 1524         }
 1525         PROC_UNLOCK(td->td_proc);
 1526 
 1527         /*
 1528          * We currently can only deal with page-aligned file offsets.
 1529          * The check is here rather than in the syscall because the
 1530          * kernel calls this function internally for other mmapping
 1531          * operations (such as in exec) and non-aligned offsets will
 1532          * cause pmap inconsistencies, so we want to be sure to
 1533          * disallow this in all cases.
 1534          */
 1535         if (foff & PAGE_MASK)
 1536                 return (EINVAL);
 1537 
 1538         if ((flags & MAP_FIXED) == 0) {
 1539                 fitit = TRUE;
 1540                 *addr = round_page(*addr);
 1541         } else {
 1542                 if (*addr != trunc_page(*addr))
 1543                         return (EINVAL);
 1544                 fitit = FALSE;
 1545         }
 1546         writecounted = FALSE;
 1547 
 1548         /*
 1549          * Lookup/allocate object.
 1550          */
 1551         switch (handle_type) {
 1552         case OBJT_DEVICE:
 1553                 error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
 1554                     handle, &foff, &object);
 1555                 break;
 1556         case OBJT_VNODE:
 1557                 error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
 1558                     handle, &foff, &object, &writecounted);
 1559                 break;
 1560         case OBJT_SWAP:
 1561                 error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
 1562                     handle, foff, &object);
 1563                 break;
 1564         case OBJT_DEFAULT:
 1565                 if (handle == NULL) {
 1566                         error = 0;
 1567                         break;
 1568                 }
 1569                 /* FALLTHROUGH */
 1570         default:
 1571                 error = EINVAL;
 1572                 break;
 1573         }
 1574         if (error)
 1575                 return (error);
 1576         if (flags & MAP_ANON) {
 1577                 object = NULL;
 1578                 docow = 0;
 1579                 /*
 1580                  * Unnamed anonymous regions always start at 0.
 1581                  */
 1582                 if (handle == 0)
 1583                         foff = 0;
 1584         } else if (flags & MAP_PREFAULT_READ)
 1585                 docow = MAP_PREFAULT;
 1586         else
 1587                 docow = MAP_PREFAULT_PARTIAL;
 1588 
 1589         if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 1590                 docow |= MAP_COPY_ON_WRITE;
 1591         if (flags & MAP_NOSYNC)
 1592                 docow |= MAP_DISABLE_SYNCER;
 1593         if (flags & MAP_NOCORE)
 1594                 docow |= MAP_DISABLE_COREDUMP;
 1595         /* Shared memory is also shared with children. */
 1596         if (flags & MAP_SHARED)
 1597                 docow |= MAP_INHERIT_SHARE;
 1598         if (writecounted)
 1599                 docow |= MAP_VN_WRITECOUNT;
 1600 
 1601         if (flags & MAP_STACK)
 1602                 rv = vm_map_stack(map, *addr, size, prot, maxprot,
 1603                     docow | MAP_STACK_GROWS_DOWN);
 1604         else if (fitit)
 1605                 rv = vm_map_find(map, object, foff, addr, size,
 1606                     object != NULL && object->type == OBJT_DEVICE ?
 1607                     VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
 1608         else
 1609                 rv = vm_map_fixed(map, object, foff, *addr, size,
 1610                                  prot, maxprot, docow);
 1611 
 1612         if (rv == KERN_SUCCESS) {
 1613                 /*
 1614                  * If the process has requested that all future mappings
 1615                  * be wired, then heed this.
 1616                  */
 1617                 if (map->flags & MAP_WIREFUTURE) {
 1618                         vm_map_wire(map, *addr, *addr + size,
 1619                             VM_MAP_WIRE_USER | ((flags & MAP_STACK) ?
 1620                             VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES));
 1621                 }
 1622         } else {
 1623                 /*
 1624                  * If this mapping was accounted for in the vnode's
 1625                  * writecount, then undo that now.
 1626                  */
 1627                 if (writecounted)
 1628                         vnode_pager_release_writecount(object, 0, size);
 1629                 /*
 1630                  * Lose the object reference.  Will destroy the
 1631                  * object if it's an unnamed anonymous mapping
 1632                  * or named anonymous without other references.
 1633                  */
 1634                 vm_object_deallocate(object);
 1635         }
 1636         return (vm_mmap_to_errno(rv));
 1637 }
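
/*
 * Illustrative sketch, not part of vm_mmap.c: a hypothetical in-kernel
 * caller using the interface declared above to establish a pageable
 * anonymous region in a process's address space; the file's own comment
 * notes that the real callers are mmap, exec, and System V shared memory.
 * The function name is a placeholder.
 */
static int
example_map_anon(struct proc *p, vm_offset_t *addrp, vm_size_t len)
{

	return (vm_mmap(&p->p_vmspace->vm_map, addrp, round_page(len),
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_ANON,
	    OBJT_DEFAULT, NULL, 0));
}
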
 1638 
 1639 /*
 1640  * Translate a Mach VM return code to zero on success or the appropriate errno
 1641  * on failure.
 1642  */
 1643 int
 1644 vm_mmap_to_errno(int rv)
 1645 {
 1646 
 1647         switch (rv) {
 1648         case KERN_SUCCESS:
 1649                 return (0);
 1650         case KERN_INVALID_ADDRESS:
 1651         case KERN_NO_SPACE:
 1652                 return (ENOMEM);
 1653         case KERN_PROTECTION_FAILURE:
 1654                 return (EACCES);
 1655         default:
 1656                 return (EINVAL);
 1657         }
 1658 }
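
/*
 * Illustrative sketch, not part of vm_mmap.c: converting the Mach-style
 * status returned by a vm_map routine into an errno suitable for return
 * to user space, using the translation function above.  The function
 * name is a placeholder.
 */
static int
example_unmap(vm_map_t map, vm_offset_t addr, vm_size_t len)
{

	return (vm_mmap_to_errno(vm_map_remove(map, trunc_page(addr),
	    round_page(addr + len))));
}
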
