FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c

    1 /*
    2  * Copyright (c) 1988 University of Utah.
    3  * Copyright (c) 1991, 1993
    4  *      The Regents of the University of California.  All rights reserved.
    5  *
    6  * This code is derived from software contributed to Berkeley by
    7  * the Systems Programming Group of the University of Utah Computer
    8  * Science Department.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   39  *
   40  *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
   41  * $FreeBSD: releng/5.0/sys/vm/vm_mmap.c 107370 2002-11-28 08:01:39Z alc $
   42  */
   43 
   44 /*
   45  * Mapped file (mmap) interface to VM
   46  */
   47 
   48 #include "opt_compat.h"
   49 #include "opt_mac.h"
   50 
   51 #include <sys/param.h>
   52 #include <sys/systm.h>
   53 #include <sys/kernel.h>
   54 #include <sys/lock.h>
   55 #include <sys/mutex.h>
   56 #include <sys/sysproto.h>
   57 #include <sys/filedesc.h>
   58 #include <sys/proc.h>
   59 #include <sys/resource.h>
   60 #include <sys/resourcevar.h>
   61 #include <sys/vnode.h>
   62 #include <sys/fcntl.h>
   63 #include <sys/file.h>
   64 #include <sys/mac.h>
   65 #include <sys/mman.h>
   66 #include <sys/conf.h>
   67 #include <sys/stat.h>
   68 #include <sys/vmmeter.h>
   69 #include <sys/sysctl.h>
   70 
   71 #include <vm/vm.h>
   72 #include <vm/vm_param.h>
   73 #include <vm/pmap.h>
   74 #include <vm/vm_map.h>
   75 #include <vm/vm_object.h>
   76 #include <vm/vm_page.h>
   77 #include <vm/vm_pager.h>
   78 #include <vm/vm_pageout.h>
   79 #include <vm/vm_extern.h>
   80 #include <vm/vm_page.h>
   81 #include <vm/vm_kern.h>
   82 
   83 #ifndef _SYS_SYSPROTO_H_
   84 struct sbrk_args {
   85         int incr;
   86 };
   87 #endif
   88 
   89 static int max_proc_mmap;
   90 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
   91 
   92 /*
   93  * Set the maximum number of vm_map_entry structures per process.  Roughly
   94  * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
   95  * of our KVM malloc space still results in generous limits.  We want a 
    96  * default that is good enough to prevent the kernel from running out of
    97  * resources if attacked from a compromised user account, but generous
    98  * enough that multi-threaded processes are not unduly inconvenienced.
   99  */
  100 static void vmmapentry_rsrc_init(void *);
  101 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)
  102 
  103 static void
  104 vmmapentry_rsrc_init(dummy)
  105         void *dummy;
  106 {
  107     max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
  108     max_proc_mmap /= 100;
  109 }
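
       /*
        * Editor's note (not in the original source): a worked example of the
        * sizing above, with illustrative numbers -- sizeof(struct
        * vm_map_entry) varies by platform.  A 200 MB vm_kmem_size divided by
        * a 100-byte entry gives roughly two million entries, so max_proc_mmap
        * comes out near 20000 mappings per process (scaled by vm_refcnt at
        * the check in mmap() below).
        */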
  110 
  111 /*
  112  * MPSAFE
  113  */
  114 /* ARGSUSED */
  115 int
  116 sbrk(td, uap)
  117         struct thread *td;
  118         struct sbrk_args *uap;
  119 {
  120         /* Not yet implemented */
  121         /* mtx_lock(&Giant); */
  122         /* mtx_unlock(&Giant); */
  123         return (EOPNOTSUPP);
  124 }
  125 
  126 #ifndef _SYS_SYSPROTO_H_
  127 struct sstk_args {
  128         int incr;
  129 };
  130 #endif
  131 
  132 /*
  133  * MPSAFE
  134  */
  135 /* ARGSUSED */
  136 int
  137 sstk(td, uap)
  138         struct thread *td;
  139         struct sstk_args *uap;
  140 {
  141         /* Not yet implemented */
  142         /* mtx_lock(&Giant); */
  143         /* mtx_unlock(&Giant); */
  144         return (EOPNOTSUPP);
  145 }
  146 
  147 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
  148 #ifndef _SYS_SYSPROTO_H_
  149 struct getpagesize_args {
  150         int dummy;
  151 };
  152 #endif
  153 
  154 /* ARGSUSED */
  155 int
  156 ogetpagesize(td, uap)
  157         struct thread *td;
  158         struct getpagesize_args *uap;
  159 {
  160         /* MP SAFE */
  161         td->td_retval[0] = PAGE_SIZE;
  162         return (0);
  163 }
  164 #endif                          /* COMPAT_43 || COMPAT_SUNOS */
  165 
  166 
  167 /* 
  168  * Memory Map (mmap) system call.  Note that the file offset
  169  * and address are allowed to be NOT page aligned, though if
   170  * the MAP_FIXED flag is set, both must have the same remainder
  171  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  172  * page-aligned, the actual mapping starts at trunc_page(addr)
  173  * and the return value is adjusted up by the page offset.
  174  *
  175  * Generally speaking, only character devices which are themselves
  176  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  177  * there would be no cache coherency between a descriptor and a VM mapping
  178  * both to the same character device.
  179  *
  180  * Block devices can be mmap'd no matter what they represent.  Cache coherency
  181  * is maintained as long as you do not write directly to the underlying
  182  * character device.
  183  */
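
       /*
        * Editor's sketch (not part of the original file): a userland view
        * of the offset rule described above, assuming a 4 KB page size.
        * The file name below is hypothetical.
        */
       #if 0
       #include <sys/types.h>
       #include <sys/mman.h>
       #include <fcntl.h>

       static char *
       map_unaligned(void)
       {
               int fd = open("/tmp/data.bin", O_RDONLY);
               char *p;

               if (fd == -1)
                       return (NULL);
               /*
                * pos = 0x1234 is not page aligned; the mapping starts at
                * trunc_page(0x1234) = 0x1000 and the return value is
                * adjusted up by 0x234, so *p is the byte at file offset
                * 0x1234.
                */
               p = mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 0x1234);
               return (p == MAP_FAILED ? NULL : p);
       }
       #endif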
  184 #ifndef _SYS_SYSPROTO_H_
  185 struct mmap_args {
  186         void *addr;
  187         size_t len;
  188         int prot;
  189         int flags;
  190         int fd;
  191         long pad;
  192         off_t pos;
  193 };
  194 #endif
  195 
  196 /*
  197  * MPSAFE
  198  */
  199 int
  200 mmap(td, uap)
  201         struct thread *td;
  202         struct mmap_args *uap;
  203 {
  204         struct file *fp = NULL;
  205         struct vnode *vp;
  206         vm_offset_t addr;
  207         vm_size_t size, pageoff;
  208         vm_prot_t prot, maxprot;
  209         void *handle;
  210         int flags, error;
  211         int disablexworkaround;
  212         off_t pos;
  213         struct vmspace *vms = td->td_proc->p_vmspace;
  214         vm_object_t obj;
  215 
  216         addr = (vm_offset_t) uap->addr;
  217         size = uap->len;
  218         prot = uap->prot & VM_PROT_ALL;
  219         flags = uap->flags;
  220         pos = uap->pos;
  221 
  222         vp = NULL;
  223         fp = NULL;
  224         /* make sure mapping fits into numeric range etc */
  225         if ((ssize_t) uap->len < 0 ||
  226             ((flags & MAP_ANON) && uap->fd != -1))
  227                 return (EINVAL);
  228 
  229         if (flags & MAP_STACK) {
  230                 if ((uap->fd != -1) ||
  231                     ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
  232                         return (EINVAL);
  233                 flags |= MAP_ANON;
  234                 pos = 0;
  235         }
  236 
  237         /*
  238          * Align the file position to a page boundary,
  239          * and save its page offset component.
  240          */
  241         pageoff = (pos & PAGE_MASK);
  242         pos -= pageoff;
  243 
  244         /* Adjust size for rounding (on both ends). */
  245         size += pageoff;                        /* low end... */
  246         size = (vm_size_t) round_page(size);    /* hi end */
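
               /*
                * Editor's note (not in the original source): e.g., with 4 KB
                * pages, pos = 0x12345 and uap->len = 0x100 give pageoff =
                * 0x345, an aligned pos of 0x12000, and size =
                * round_page(0x445) = 0x1000.
                */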
  247 
  248         /*
  249          * Check for illegal addresses.  Watch out for address wrap... Note
  250          * that VM_*_ADDRESS are not constants due to casts (argh).
  251          */
  252         if (flags & MAP_FIXED) {
  253                 /*
  254                  * The specified address must have the same remainder
  255                  * as the file offset taken modulo PAGE_SIZE, so it
  256                  * should be aligned after adjustment by pageoff.
  257                  */
  258                 addr -= pageoff;
  259                 if (addr & PAGE_MASK)
  260                         return (EINVAL);
  261                 /* Address range must be all in user VM space. */
  262                 if (addr < vm_map_min(&vms->vm_map) ||
  263                     addr + size > vm_map_max(&vms->vm_map))
  264                         return (EINVAL);
  265                 if (addr + size < addr)
  266                         return (EINVAL);
  267         }
  268         /*
  269          * XXX for non-fixed mappings where no hint is provided or
  270          * the hint would fall in the potential heap space,
  271          * place it after the end of the largest possible heap.
  272          *
  273          * There should really be a pmap call to determine a reasonable
  274          * location.
  275          */
  276         else if (addr == 0 ||
  277             (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
  278              addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
  279                 addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
  280 
  281         mtx_lock(&Giant);       /* syscall marked mp-safe but isn't */
  282         if (flags & MAP_ANON) {
  283                 /*
  284                  * Mapping blank space is trivial.
  285                  */
  286                 handle = NULL;
  287                 maxprot = VM_PROT_ALL;
  288                 pos = 0;
  289         } else {
  290                 /*
   291          * Mapping a file: get fp for validation, obtain the vnode, and
   292          * make sure it is of an appropriate type.  Don't let the
   293          * descriptor disappear on us if we block.
  294                  */
  295                 if ((error = fget(td, uap->fd, &fp)) != 0)
  296                         goto done;
  297                 if (fp->f_type != DTYPE_VNODE) {
  298                         error = EINVAL;
  299                         goto done;
  300                 }
  301 
  302                 /*
  303                  * POSIX shared-memory objects are defined to have
  304                  * kernel persistence, and are not defined to support
  305                  * read(2)/write(2) -- or even open(2).  Thus, we can
  306                  * use MAP_ASYNC to trade on-disk coherence for speed.
  307                  * The shm_open(3) library routine turns on the FPOSIXSHM
  308                  * flag to request this behavior.
  309                  */
  310                 if (fp->f_flag & FPOSIXSHM)
  311                         flags |= MAP_NOSYNC;
  312                 vp = (struct vnode *) fp->f_data;
  313                 error = vget(vp, LK_EXCLUSIVE, td);
  314                 if (error)
  315                         goto done;
  316                 if (vp->v_type != VREG && vp->v_type != VCHR) {
  317                         error = EINVAL;
  318                         goto done;
  319                 }
  320                 if (vp->v_type == VREG) {
  321                         /*
  322                          * Get the proper underlying object
  323                          */
  324                         if (VOP_GETVOBJECT(vp, &obj) != 0) {
  325                                 error = EINVAL;
  326                                 goto done;
  327                         }
  328                         if (obj->handle != vp) {
  329                                 vput(vp);
  330                                 vp = (struct vnode*)obj->handle;
  331                                 vget(vp, LK_EXCLUSIVE, td);
  332                         }
  333                 }
  334                 /*
  335                  * XXX hack to handle use of /dev/zero to map anon memory (ala
  336                  * SunOS).
  337                  */
  338                 if ((vp->v_type == VCHR) && 
  339                     (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
  340                         handle = NULL;
  341                         maxprot = VM_PROT_ALL;
  342                         flags |= MAP_ANON;
  343                         pos = 0;
  344                 } else {
  345                         /*
   346                          * cdevs do not provide private mappings of any kind.
  347                          */
  348                         /*
  349                          * However, for XIG X server to continue to work,
  350                          * we should allow the superuser to do it anyway.
  351                          * We only allow it at securelevel < 1.
  352                          * (Because the XIG X server writes directly to video
  353                          * memory via /dev/mem, it should never work at any
   354                          * other securelevel.)
  355                          * XXX this will have to go
  356                          */
  357                         if (securelevel_ge(td->td_ucred, 1))
  358                                 disablexworkaround = 1;
  359                         else
  360                                 disablexworkaround = suser(td);
  361                         if (vp->v_type == VCHR && disablexworkaround &&
  362                             (flags & (MAP_PRIVATE|MAP_COPY))) {
  363                                 error = EINVAL;
  364                                 goto done;
  365                         }
  366                         /*
  367                          * Ensure that file and memory protections are
  368                          * compatible.  Note that we only worry about
  369                          * writability if mapping is shared; in this case,
  370                          * current and max prot are dictated by the open file.
  371                          * XXX use the vnode instead?  Problem is: what
  372                          * credentials do we use for determination? What if
  373                          * proc does a setuid?
  374                          */
  375                         maxprot = VM_PROT_EXECUTE;      /* ??? */
  376                         if (fp->f_flag & FREAD) {
  377                                 maxprot |= VM_PROT_READ;
  378                         } else if (prot & PROT_READ) {
  379                                 error = EACCES;
  380                                 goto done;
  381                         }
  382                         /*
  383                          * If we are sharing potential changes (either via
  384                          * MAP_SHARED or via the implicit sharing of character
  385                          * device mappings), and we are trying to get write
  386                          * permission although we opened it without asking
  387                          * for it, bail out.  Check for superuser, only if
  388                          * we're at securelevel < 1, to allow the XIG X server
  389                          * to continue to work.
  390                          */
  391                         if ((flags & MAP_SHARED) != 0 ||
  392                             (vp->v_type == VCHR && disablexworkaround)) {
  393                                 if ((fp->f_flag & FWRITE) != 0) {
  394                                         struct vattr va;
  395                                         if ((error =
  396                                             VOP_GETATTR(vp, &va,
  397                                                         td->td_ucred, td))) {
  398                                                 goto done;
  399                                         }
  400                                         if ((va.va_flags &
  401                                            (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
  402                                                 maxprot |= VM_PROT_WRITE;
  403                                         } else if (prot & PROT_WRITE) {
  404                                                 error = EPERM;
  405                                                 goto done;
  406                                         }
  407                                 } else if ((prot & PROT_WRITE) != 0) {
  408                                         error = EACCES;
  409                                         goto done;
  410                                 }
  411                         } else {
  412                                 maxprot |= VM_PROT_WRITE;
  413                         }
  414 
  415                         handle = (void *)vp;
  416                 }
  417         }
  418 
  419         /*
   420          * Do not allow more than a certain number of vm_map_entry structures
  421          * per process.  Scale with the number of rforks sharing the map
  422          * to make the limit reasonable for threads.
  423          */
  424         if (max_proc_mmap && 
  425             vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
  426                 error = ENOMEM;
  427                 goto done;
  428         }
  429 
  430         mtx_unlock(&Giant);
  431         error = 0;
  432 #ifdef MAC
  433         if (handle != NULL && (flags & MAP_SHARED) != 0) {
  434                 error = mac_check_vnode_mmap(td->td_ucred,
  435                     (struct vnode *)handle, prot);
  436         }
  437 #endif
  438         if (error == 0)
  439                 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
  440                     flags, handle, pos);
  441         mtx_lock(&Giant);
  442         if (error == 0)
  443                 td->td_retval[0] = (register_t) (addr + pageoff);
  444 done:
  445         if (vp)
  446                 vput(vp);
  447         mtx_unlock(&Giant);
  448         if (fp)
  449                 fdrop(fp, td);
  450 
  451         return (error);
  452 }
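
       /*
        * Editor's sketch (not part of the original file): the POSIX
        * shared-memory path mentioned above, from userland.  shm_open(3)
        * sets FPOSIXSHM on the descriptor, so the mapping below is
        * implicitly MAP_NOSYNC.  The object name is hypothetical.
        */
       #if 0
       #include <sys/types.h>
       #include <sys/mman.h>
       #include <fcntl.h>
       #include <unistd.h>

       static void *
       map_posix_shm(size_t len)
       {
               int fd;

               if ((fd = shm_open("/example", O_RDWR | O_CREAT, 0600)) == -1)
                       return (MAP_FAILED);
               if (ftruncate(fd, (off_t)len) == -1) {
                       close(fd);
                       return (MAP_FAILED);
               }
               /* FPOSIXSHM is set on fd, so this is implicitly MAP_NOSYNC. */
               return (mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                   fd, 0));
       }
       #endif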
  453 
  454 #ifdef COMPAT_43
  455 #ifndef _SYS_SYSPROTO_H_
  456 struct ommap_args {
  457         caddr_t addr;
  458         int len;
  459         int prot;
  460         int flags;
  461         int fd;
  462         long pos;
  463 };
  464 #endif
  465 int
  466 ommap(td, uap)
  467         struct thread *td;
  468         struct ommap_args *uap;
  469 {
  470         struct mmap_args nargs;
  471         static const char cvtbsdprot[8] = {
  472                 0,
  473                 PROT_EXEC,
  474                 PROT_WRITE,
  475                 PROT_EXEC | PROT_WRITE,
  476                 PROT_READ,
  477                 PROT_EXEC | PROT_READ,
  478                 PROT_WRITE | PROT_READ,
  479                 PROT_EXEC | PROT_WRITE | PROT_READ,
  480         };
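
               /*
                * Editor's note (not in the original source): in the old
                * 4.3BSD encoding above, exec = 1, write = 2 and read = 4,
                * so e.g. an old prot of 5 indexes PROT_EXEC | PROT_READ.
                */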
  481 
  482 #define OMAP_ANON       0x0002
  483 #define OMAP_COPY       0x0020
  484 #define OMAP_SHARED     0x0010
  485 #define OMAP_FIXED      0x0100
  486 
  487         nargs.addr = uap->addr;
  488         nargs.len = uap->len;
  489         nargs.prot = cvtbsdprot[uap->prot & 0x7];
  490         nargs.flags = 0;
  491         if (uap->flags & OMAP_ANON)
  492                 nargs.flags |= MAP_ANON;
  493         if (uap->flags & OMAP_COPY)
  494                 nargs.flags |= MAP_COPY;
  495         if (uap->flags & OMAP_SHARED)
  496                 nargs.flags |= MAP_SHARED;
  497         else
  498                 nargs.flags |= MAP_PRIVATE;
  499         if (uap->flags & OMAP_FIXED)
  500                 nargs.flags |= MAP_FIXED;
  501         nargs.fd = uap->fd;
  502         nargs.pos = uap->pos;
  503         return (mmap(td, &nargs));
  504 }
  505 #endif                          /* COMPAT_43 */
  506 
  507 
  508 #ifndef _SYS_SYSPROTO_H_
  509 struct msync_args {
  510         void *addr;
  511         int len;
  512         int flags;
  513 };
  514 #endif
  515 /*
  516  * MPSAFE
  517  */
  518 int
  519 msync(td, uap)
  520         struct thread *td;
  521         struct msync_args *uap;
  522 {
  523         vm_offset_t addr;
  524         vm_size_t size, pageoff;
  525         int flags;
  526         vm_map_t map;
  527         int rv;
  528 
  529         addr = (vm_offset_t) uap->addr;
  530         size = uap->len;
  531         flags = uap->flags;
  532 
  533         pageoff = (addr & PAGE_MASK);
  534         addr -= pageoff;
  535         size += pageoff;
  536         size = (vm_size_t) round_page(size);
  537         if (addr + size < addr)
  538                 return (EINVAL);
  539 
  540         if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
  541                 return (EINVAL);
  542 
  543         mtx_lock(&Giant);
  544 
  545         map = &td->td_proc->p_vmspace->vm_map;
  546 
  547         /*
  548          * XXX Gak!  If size is zero we are supposed to sync "all modified
   549  * pages within the region containing addr".  Unfortunately, we don't
  550          * really keep track of individual mmaps so we approximate by flushing
  551          * the range of the map entry containing addr. This can be incorrect
  552          * if the region splits or is coalesced with a neighbor.
  553          */
  554         if (size == 0) {
  555                 vm_map_entry_t entry;
  556 
  557                 vm_map_lock_read(map);
  558                 rv = vm_map_lookup_entry(map, addr, &entry);
  559                 vm_map_unlock_read(map);
  560                 if (rv == FALSE) {
  561                         rv = -1;
  562                         goto done2;
  563                 }
  564                 addr = entry->start;
  565                 size = entry->end - entry->start;
  566         }
  567 
  568         /*
  569          * Clean the pages and interpret the return value.
  570          */
  571         rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
  572             (flags & MS_INVALIDATE) != 0);
  573 
  574 done2:
  575         mtx_unlock(&Giant);
  576 
  577         switch (rv) {
  578         case KERN_SUCCESS:
  579                 return (0);
  580         case KERN_INVALID_ADDRESS:
  581                 return (EINVAL);        /* Sun returns ENOMEM? */
  582         case KERN_FAILURE:
  583                 return (EIO);
  584         default:
  585                 return (EINVAL);
  586         }
  587 }
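
       /*
        * Editor's sketch (not part of the original file): exercising the
        * msync() rules above from userland.  MS_ASYNC together with
        * MS_INVALIDATE would fail with EINVAL, and a zero length flushes
        * the whole map entry containing the address.
        */
       #if 0
       #include <sys/types.h>
       #include <sys/mman.h>

       static int
       flush_range(void *base, size_t len)
       {
               if (msync(base, len, MS_ASYNC) == -1)
                       return (-1);
               /* len = 0: sync the entire map entry containing base. */
               return (msync(base, 0, MS_SYNC));
       }
       #endif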
  588 
  589 #ifndef _SYS_SYSPROTO_H_
  590 struct munmap_args {
  591         void *addr;
  592         size_t len;
  593 };
  594 #endif
  595 /*
  596  * MPSAFE
  597  */
  598 int
  599 munmap(td, uap)
  600         struct thread *td;
  601         struct munmap_args *uap;
  602 {
  603         vm_offset_t addr;
  604         vm_size_t size, pageoff;
  605         vm_map_t map;
  606 
  607         addr = (vm_offset_t) uap->addr;
  608         size = uap->len;
  609 
  610         pageoff = (addr & PAGE_MASK);
  611         addr -= pageoff;
  612         size += pageoff;
  613         size = (vm_size_t) round_page(size);
  614         if (addr + size < addr)
  615                 return (EINVAL);
  616 
  617         if (size == 0)
  618                 return (0);
  619 
  620         /*
  621          * Check for illegal addresses.  Watch out for address wrap...
  622          */
  623         map = &td->td_proc->p_vmspace->vm_map;
  624         if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
  625                 return (EINVAL);
  626         /*
  627          * Make sure entire range is allocated.
  628          */
  629         if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
  630                 return (EINVAL);
  631 
  632         /* returns nothing but KERN_SUCCESS anyway */
  633         (void) vm_map_remove(map, addr, addr + size);
  634         return (0);
  635 }
  636 
  637 #if 0
  638 void
  639 munmapfd(td, fd)
  640         struct thread *td;
  641         int fd;
  642 {
  643         /*
  644          * XXX should unmap any regions mapped to this file
  645          */
   646         FILEDESC_LOCK(td->td_proc->p_fd);
   647         td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED;
   648         FILEDESC_UNLOCK(td->td_proc->p_fd);
  649 }
  650 #endif
  651 
  652 #ifndef _SYS_SYSPROTO_H_
  653 struct mprotect_args {
  654         const void *addr;
  655         size_t len;
  656         int prot;
  657 };
  658 #endif
  659 /*
  660  * MPSAFE
  661  */
  662 int
  663 mprotect(td, uap)
  664         struct thread *td;
  665         struct mprotect_args *uap;
  666 {
  667         vm_offset_t addr;
  668         vm_size_t size, pageoff;
  669         vm_prot_t prot;
  670 
  671         addr = (vm_offset_t) uap->addr;
  672         size = uap->len;
  673         prot = uap->prot & VM_PROT_ALL;
  674 #if defined(VM_PROT_READ_IS_EXEC)
  675         if (prot & VM_PROT_READ)
  676                 prot |= VM_PROT_EXECUTE;
  677 #endif
  678 
  679         pageoff = (addr & PAGE_MASK);
  680         addr -= pageoff;
  681         size += pageoff;
  682         size = (vm_size_t) round_page(size);
  683         if (addr + size < addr)
  684                 return (EINVAL);
  685 
  686         switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
  687             addr + size, prot, FALSE)) {
  688         case KERN_SUCCESS:
  689                 return (0);
  690         case KERN_PROTECTION_FAILURE:
  691                 return (EACCES);
  692         }
  693         return (EINVAL);
  694 }
  695 
  696 #ifndef _SYS_SYSPROTO_H_
  697 struct minherit_args {
  698         void *addr;
  699         size_t len;
  700         int inherit;
  701 };
  702 #endif
  703 /*
  704  * MPSAFE
  705  */
  706 int
  707 minherit(td, uap)
  708         struct thread *td;
  709         struct minherit_args *uap;
  710 {
  711         vm_offset_t addr;
  712         vm_size_t size, pageoff;
  713         vm_inherit_t inherit;
  714 
  715         addr = (vm_offset_t)uap->addr;
  716         size = uap->len;
  717         inherit = uap->inherit;
  718 
  719         pageoff = (addr & PAGE_MASK);
  720         addr -= pageoff;
  721         size += pageoff;
  722         size = (vm_size_t) round_page(size);
  723         if (addr + size < addr)
  724                 return (EINVAL);
  725 
  726         switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
  727             addr + size, inherit)) {
  728         case KERN_SUCCESS:
  729                 return (0);
  730         case KERN_PROTECTION_FAILURE:
  731                 return (EACCES);
  732         }
  733         return (EINVAL);
  734 }
  735 
  736 #ifndef _SYS_SYSPROTO_H_
  737 struct madvise_args {
  738         void *addr;
  739         size_t len;
  740         int behav;
  741 };
  742 #endif
  743 
  744 /*
  745  * MPSAFE
  746  */
  747 /* ARGSUSED */
  748 int
  749 madvise(td, uap)
  750         struct thread *td;
  751         struct madvise_args *uap;
  752 {
  753         vm_offset_t start, end;
  754         vm_map_t map;
  755 
  756         /*
  757          * Check for illegal behavior
  758          */
  759         if (uap->behav < 0 || uap->behav > MADV_CORE)
  760                 return (EINVAL);
  761         /*
  762          * Check for illegal addresses.  Watch out for address wrap... Note
  763          * that VM_*_ADDRESS are not constants due to casts (argh).
  764          */
  765         map = &td->td_proc->p_vmspace->vm_map;
  766         if ((vm_offset_t)uap->addr < vm_map_min(map) ||
  767             (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
  768                 return (EINVAL);
  769         if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
  770                 return (EINVAL);
  771 
  772         /*
  773          * Since this routine is only advisory, we default to conservative
  774          * behavior.
  775          */
  776         start = trunc_page((vm_offset_t) uap->addr);
  777         end = round_page((vm_offset_t) uap->addr + uap->len);
  778         
  779         if (vm_map_madvise(map, start, end, uap->behav))
  780                 return (EINVAL);
  781         return (0);
  782 }
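
       /*
        * Editor's sketch (not part of the original file): advisory hints
        * from userland.  Values outside the [0, MADV_CORE] range are
        * rejected by the behavior check above.
        */
       #if 0
       #include <sys/types.h>
       #include <sys/mman.h>

       static void
       advise_readahead(void *base, size_t len)
       {
               /* Purely advisory; failure is not fatal to the caller. */
               (void)madvise(base, len, MADV_SEQUENTIAL);
               (void)madvise(base, len, MADV_WILLNEED);
       }
       #endif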
  783 
  784 #ifndef _SYS_SYSPROTO_H_
  785 struct mincore_args {
  786         const void *addr;
  787         size_t len;
  788         char *vec;
  789 };
  790 #endif
  791 
  792 /*
  793  * MPSAFE
  794  */
  795 /* ARGSUSED */
  796 int
  797 mincore(td, uap)
  798         struct thread *td;
  799         struct mincore_args *uap;
  800 {
  801         vm_offset_t addr, first_addr;
  802         vm_offset_t end, cend;
  803         pmap_t pmap;
  804         vm_map_t map;
  805         char *vec;
  806         int error = 0;
  807         int vecindex, lastvecindex;
  808         vm_map_entry_t current;
  809         vm_map_entry_t entry;
  810         int mincoreinfo;
  811         unsigned int timestamp;
  812 
  813         /*
  814          * Make sure that the addresses presented are valid for user
  815          * mode.
  816          */
  817         first_addr = addr = trunc_page((vm_offset_t) uap->addr);
  818         end = addr + (vm_size_t)round_page(uap->len);
  819         map = &td->td_proc->p_vmspace->vm_map;
  820         if (end > vm_map_max(map) || end < addr)
  821                 return (EINVAL);
  822 
  823         /*
  824          * Address of byte vector
  825          */
  826         vec = uap->vec;
  827 
  828         mtx_lock(&Giant);
  829         pmap = vmspace_pmap(td->td_proc->p_vmspace);
  830 
  831         vm_map_lock_read(map);
  832 RestartScan:
  833         timestamp = map->timestamp;
  834 
  835         if (!vm_map_lookup_entry(map, addr, &entry))
  836                 entry = entry->next;
  837 
  838         /*
  839          * Do this on a map entry basis so that if the pages are not
   840          * in the current process's address space, we can easily look
  841          * up the pages elsewhere.
  842          */
  843         lastvecindex = -1;
  844         for (current = entry;
  845             (current != &map->header) && (current->start < end);
  846             current = current->next) {
  847 
  848                 /*
  849                  * ignore submaps (for now) or null objects
  850                  */
  851                 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
  852                         current->object.vm_object == NULL)
  853                         continue;
  854                 
  855                 /*
  856                  * limit this scan to the current map entry and the
  857                  * limits for the mincore call
  858                  */
  859                 if (addr < current->start)
  860                         addr = current->start;
  861                 cend = current->end;
  862                 if (cend > end)
  863                         cend = end;
  864 
  865                 /*
  866                  * scan this entry one page at a time
  867                  */
  868                 while (addr < cend) {
  869                         /*
   870                          * Check the pmap first; it is likely faster and can
   871                          * also tell us whether we are the one referencing
   872                          * or modifying the page.
  873                          */
  874                         mincoreinfo = pmap_mincore(pmap, addr);
  875                         if (!mincoreinfo) {
  876                                 vm_pindex_t pindex;
  877                                 vm_ooffset_t offset;
  878                                 vm_page_t m;
  879                                 /*
  880                                  * calculate the page index into the object
  881                                  */
  882                                 offset = current->offset + (addr - current->start);
  883                                 pindex = OFF_TO_IDX(offset);
  884                                 m = vm_page_lookup(current->object.vm_object,
  885                                         pindex);
  886                                 vm_page_lock_queues();
  887                                 /*
  888                                  * if the page is resident, then gather information about
  889                                  * it.
  890                                  */
  891                                 if (m) {
  892                                         mincoreinfo = MINCORE_INCORE;
  893                                         if (m->dirty ||
  894                                                 pmap_is_modified(m))
  895                                                 mincoreinfo |= MINCORE_MODIFIED_OTHER;
  896                                         if ((m->flags & PG_REFERENCED) ||
  897                                                 pmap_ts_referenced(m)) {
  898                                                 vm_page_flag_set(m, PG_REFERENCED);
  899                                                 mincoreinfo |= MINCORE_REFERENCED_OTHER;
  900                                         }
  901                                 }
  902                                 vm_page_unlock_queues();
  903                         }
  904 
  905                         /*
   906                          * subyte may page fault.  Because the fault handler
   907                          * may need to modify the map, we release the lock.
  908                          */
  909                         vm_map_unlock_read(map);
  910 
  911                         /*
  912                          * calculate index into user supplied byte vector
  913                          */
  914                         vecindex = OFF_TO_IDX(addr - first_addr);
  915 
  916                         /*
  917                          * If we have skipped map entries, we need to make sure that
  918                          * the byte vector is zeroed for those skipped entries.
  919                          */
  920                         while ((lastvecindex + 1) < vecindex) {
  921                                 error = subyte(vec + lastvecindex, 0);
  922                                 if (error) {
  923                                         error = EFAULT;
  924                                         goto done2;
  925                                 }
  926                                 ++lastvecindex;
  927                         }
  928 
  929                         /*
  930                          * Pass the page information to the user
  931                          */
  932                         error = subyte(vec + vecindex, mincoreinfo);
  933                         if (error) {
  934                                 error = EFAULT;
  935                                 goto done2;
  936                         }
  937 
  938                         /*
  939                          * If the map has changed, due to the subyte, the previous
  940                          * output may be invalid.
  941                          */
  942                         vm_map_lock_read(map);
  943                         if (timestamp != map->timestamp)
  944                                 goto RestartScan;
  945 
  946                         lastvecindex = vecindex;
  947                         addr += PAGE_SIZE;
  948                 }
  949         }
  950 
  951         /*
   952          * subyte may page fault.  Because the fault handler may
   953          * need to modify the map, we release the lock.
  954          */
  955         vm_map_unlock_read(map);
  956 
  957         /*
  958          * Zero the last entries in the byte vector.
  959          */
  960         vecindex = OFF_TO_IDX(end - first_addr);
  961         while ((lastvecindex + 1) < vecindex) {
  962                 error = subyte(vec + lastvecindex, 0);
  963                 if (error) {
  964                         error = EFAULT;
  965                         goto done2;
  966                 }
  967                 ++lastvecindex;
  968         }
  969         
  970         /*
  971          * If the map has changed, due to the subyte, the previous
  972          * output may be invalid.
  973          */
  974         vm_map_lock_read(map);
  975         if (timestamp != map->timestamp)
  976                 goto RestartScan;
  977         vm_map_unlock_read(map);
  978 done2:
  979         mtx_unlock(&Giant);
  980         return (error);
  981 }
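
       /*
        * Editor's sketch (not part of the original file): reading the
        * residency vector described above from userland; mincore() fills
        * in one status byte per page of the range.
        */
       #if 0
       #include <sys/types.h>
       #include <sys/mman.h>
       #include <stdlib.h>
       #include <unistd.h>

       static int
       count_resident(const void *base, size_t len)
       {
               long ps = sysconf(_SC_PAGESIZE);
               size_t i, npages = (len + ps - 1) / ps;
               char *vec = malloc(npages);
               int n = 0;

               if (vec == NULL || mincore(base, len, vec) == -1) {
                       free(vec);
                       return (-1);
               }
               for (i = 0; i < npages; i++)
                       if (vec[i] & MINCORE_INCORE)
                               n++;
               free(vec);
               return (n);
       }
       #endif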
  982 
  983 #ifndef _SYS_SYSPROTO_H_
  984 struct mlock_args {
  985         const void *addr;
  986         size_t len;
  987 };
  988 #endif
  989 /*
  990  * MPSAFE
  991  */
  992 int
  993 mlock(td, uap)
  994         struct thread *td;
  995         struct mlock_args *uap;
  996 {
  997         vm_offset_t addr;
  998         vm_size_t size, pageoff;
  999         int error;
 1000 
 1001         addr = (vm_offset_t) uap->addr;
 1002         size = uap->len;
 1003 
 1004         pageoff = (addr & PAGE_MASK);
 1005         addr -= pageoff;
 1006         size += pageoff;
 1007         size = (vm_size_t) round_page(size);
 1008 
 1009         /* disable wrap around */
 1010         if (addr + size < addr)
 1011                 return (EINVAL);
 1012 
 1013         if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
 1014                 return (EAGAIN);
 1015 
 1016 #ifdef pmap_wired_count
 1017         if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
 1018             td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
 1019                 return (ENOMEM);
 1020 #else
 1021         error = suser(td);
 1022         if (error)
 1023                 return (error);
 1024 #endif
 1025 
 1026         error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
 1027                      addr + size, TRUE);
 1028         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1029 }
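
       /*
        * Editor's sketch (not part of the original file): wiring a buffer
        * from userland.  Per the checks above this can fail with EAGAIN
        * (system-wide wired-page limit), ENOMEM (RLIMIT_MEMLOCK) or, on
        * platforms without pmap_wired_count, a privilege error from
        * suser() for unprivileged callers.
        */
       #if 0
       #include <sys/types.h>
       #include <sys/mman.h>

       static int
       with_wired_buffer(void *buf, size_t len)
       {
               if (mlock(buf, len) == -1)
                       return (-1);
               /* ... use buf without risk of being paged out ... */
               return (munlock(buf, len));
       }
       #endif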
 1030 
 1031 #ifndef _SYS_SYSPROTO_H_
 1032 struct mlockall_args {
 1033         int     how;
 1034 };
 1035 #endif
 1036 
 1037 /*
 1038  * MPSAFE
 1039  */
 1040 int
 1041 mlockall(td, uap)
 1042         struct thread *td;
 1043         struct mlockall_args *uap;
 1044 {
 1045         /* mtx_lock(&Giant); */
 1046         /* mtx_unlock(&Giant); */
 1047         return 0;
 1048 }
 1049 
 1050 #ifndef _SYS_SYSPROTO_H_
 1051 struct munlockall_args {
 1052         int     how;
 1053 };
 1054 #endif
 1055 
 1056 /*
 1057  * MPSAFE
 1058  */
 1059 int
 1060 munlockall(td, uap)
 1061         struct thread *td;
 1062         struct munlockall_args *uap;
 1063 {
 1064         /* mtx_lock(&Giant); */
 1065         /* mtx_unlock(&Giant); */
 1066         return 0;
 1067 }
 1068 
 1069 #ifndef _SYS_SYSPROTO_H_
 1070 struct munlock_args {
 1071         const void *addr;
 1072         size_t len;
 1073 };
 1074 #endif
 1075 /*
 1076  * MPSAFE
 1077  */
 1078 int
 1079 munlock(td, uap)
 1080         struct thread *td;
 1081         struct munlock_args *uap;
 1082 {
 1083         vm_offset_t addr;
 1084         vm_size_t size, pageoff;
 1085         int error;
 1086 
 1087         addr = (vm_offset_t) uap->addr;
 1088         size = uap->len;
 1089 
 1090         pageoff = (addr & PAGE_MASK);
 1091         addr -= pageoff;
 1092         size += pageoff;
 1093         size = (vm_size_t) round_page(size);
 1094 
 1095         /* disable wrap around */
 1096         if (addr + size < addr)
 1097                 return (EINVAL);
 1098 
 1099 #ifndef pmap_wired_count
 1100         error = suser(td);
 1101         if (error)
 1102                 return (error);
 1103 #endif
 1104 
 1105         error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
 1106                      addr + size, TRUE);
 1107         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1108 }
 1109 
 1110 /*
 1111  * vm_mmap()
 1112  *
 1113  * MPSAFE
 1114  *
 1115  * Internal version of mmap.  Currently used by mmap, exec, and sys5
 1116  * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 1117  */
 1118 int
 1119 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1120         vm_prot_t maxprot, int flags,
 1121         void *handle,
 1122         vm_ooffset_t foff)
 1123 {
 1124         boolean_t fitit;
 1125         vm_object_t object;
 1126         struct vnode *vp = NULL;
 1127         objtype_t type;
 1128         int rv = KERN_SUCCESS;
 1129         vm_ooffset_t objsize;
 1130         int docow;
 1131         struct thread *td = curthread;
 1132 
 1133         if (size == 0)
 1134                 return (0);
 1135 
 1136         objsize = size = round_page(size);
 1137 
 1138         if (td->td_proc->p_vmspace->vm_map.size + size >
 1139             td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
 1140                 return(ENOMEM);
 1141         }
 1142 
 1143         /*
 1144          * We currently can only deal with page aligned file offsets.
 1145          * The check is here rather than in the syscall because the
  1146          * kernel calls this function internally for other mmap
 1147          * operations (such as in exec) and non-aligned offsets will
 1148          * cause pmap inconsistencies...so we want to be sure to
 1149          * disallow this in all cases.
 1150          */
 1151         if (foff & PAGE_MASK)
 1152                 return (EINVAL);
 1153 
 1154         if ((flags & MAP_FIXED) == 0) {
 1155                 fitit = TRUE;
 1156                 *addr = round_page(*addr);
 1157         } else {
 1158                 if (*addr != trunc_page(*addr))
 1159                         return (EINVAL);
 1160                 fitit = FALSE;
 1161                 (void) vm_map_remove(map, *addr, *addr + size);
 1162         }
 1163 
 1164         /*
 1165          * Lookup/allocate object.
 1166          */
 1167         if (flags & MAP_ANON) {
 1168                 type = OBJT_DEFAULT;
 1169                 /*
 1170                  * Unnamed anonymous regions always start at 0.
 1171                  */
 1172                 if (handle == 0)
 1173                         foff = 0;
 1174         } else {
 1175                 vp = (struct vnode *) handle;
 1176                 mtx_lock(&Giant);
 1177                 ASSERT_VOP_LOCKED(vp, "vm_mmap");
 1178                 if (vp->v_type == VCHR) {
 1179                         type = OBJT_DEVICE;
 1180                         handle = (void *)(intptr_t)vp->v_rdev;
 1181                 } else {
 1182                         struct vattr vat;
 1183                         int error;
 1184 
 1185                         error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
 1186                         if (error) {
 1187                                 mtx_unlock(&Giant);
 1188                                 return (error);
 1189                         }
 1190                         objsize = round_page(vat.va_size);
 1191                         type = OBJT_VNODE;
 1192                         /*
  1193                          * If it is a regular file with no remaining links
  1194                          * (it has been deleted), we do not need to sync it.
 1195                          */
 1196                         if (vp->v_type == VREG && vat.va_nlink == 0) {
 1197                                 flags |= MAP_NOSYNC;
 1198                         }
 1199                 }
 1200                 mtx_unlock(&Giant);
 1201         }
 1202 
 1203         if (handle == NULL) {
 1204                 object = NULL;
 1205                 docow = 0;
 1206         } else {
 1207                 object = vm_pager_allocate(type,
 1208                         handle, objsize, prot, foff);
 1209                 if (object == NULL) {
 1210                         return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
 1211                 }
 1212                 docow = MAP_PREFAULT_PARTIAL;
 1213         }
 1214 
 1215         /*
 1216          * Force device mappings to be shared.
 1217          */
 1218         if (type == OBJT_DEVICE || type == OBJT_PHYS) {
 1219                 flags &= ~(MAP_PRIVATE|MAP_COPY);
 1220                 flags |= MAP_SHARED;
 1221         }
 1222 
 1223         if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 1224                 docow |= MAP_COPY_ON_WRITE;
 1225         if (flags & MAP_NOSYNC)
 1226                 docow |= MAP_DISABLE_SYNCER;
 1227         if (flags & MAP_NOCORE)
 1228                 docow |= MAP_DISABLE_COREDUMP;
 1229 
 1230 #if defined(VM_PROT_READ_IS_EXEC)
 1231         if (prot & VM_PROT_READ)
 1232                 prot |= VM_PROT_EXECUTE;
 1233 
 1234         if (maxprot & VM_PROT_READ)
 1235                 maxprot |= VM_PROT_EXECUTE;
 1236 #endif
 1237 
 1238         if (fitit)
 1239                 *addr = pmap_addr_hint(object, *addr, size);
 1240 
 1241         if (flags & MAP_STACK)
 1242                 rv = vm_map_stack (map, *addr, size, prot,
 1243                                    maxprot, docow);
 1244         else
 1245                 rv = vm_map_find(map, object, foff, addr, size, fitit,
 1246                                  prot, maxprot, docow);
 1247 
 1248         if (rv != KERN_SUCCESS) {
 1249                 /*
 1250                  * Lose the object reference. Will destroy the
 1251                  * object if it's an unnamed anonymous mapping
 1252                  * or named anonymous without other references.
 1253                  */
 1254                 vm_object_deallocate(object);
 1255         } else if (flags & MAP_SHARED) {
 1256                 /*
 1257                  * Shared memory is also shared with children.
 1258                  */
 1259                 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
 1260                 if (rv != KERN_SUCCESS)
 1261                         (void) vm_map_remove(map, *addr, *addr + size);
 1262         }
 1263         switch (rv) {
 1264         case KERN_SUCCESS:
 1265                 return (0);
 1266         case KERN_INVALID_ADDRESS:
 1267         case KERN_NO_SPACE:
 1268                 return (ENOMEM);
 1269         case KERN_PROTECTION_FAILURE:
 1270                 return (EACCES);
 1271         default:
 1272                 return (EINVAL);
 1273         }
 1274 }
