FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c


    1 /*
    2  * Copyright (c) 1988 University of Utah.
    3  * Copyright (c) 1991, 1993
    4  *      The Regents of the University of California.  All rights reserved.
    5  *
    6  * This code is derived from software contributed to Berkeley by
    7  * the Systems Programming Group of the University of Utah Computer
    8  * Science Department.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 3. All advertising materials mentioning features or use of this software
   19  *    must display the following acknowledgement:
   20  *      This product includes software developed by the University of
   21  *      California, Berkeley and its contributors.
   22  * 4. Neither the name of the University nor the names of its contributors
   23  *    may be used to endorse or promote products derived from this software
   24  *    without specific prior written permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   36  * SUCH DAMAGE.
   37  *
   38  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   39  *
   40  *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
   41  */
   42 
   43 /*
   44  * Mapped file (mmap) interface to VM
   45  */
   46 
   47 #include <sys/cdefs.h>
   48 __FBSDID("$FreeBSD: releng/5.2/sys/vm/vm_mmap.c 123469 2003-12-11 20:30:15Z kan $");
   49 
   50 #include "opt_compat.h"
   51 #include "opt_mac.h"
   52 
   53 #include <sys/param.h>
   54 #include <sys/systm.h>
   55 #include <sys/kernel.h>
   56 #include <sys/lock.h>
   57 #include <sys/mutex.h>
   58 #include <sys/sysproto.h>
   59 #include <sys/filedesc.h>
   60 #include <sys/proc.h>
   61 #include <sys/resource.h>
   62 #include <sys/resourcevar.h>
   63 #include <sys/vnode.h>
   64 #include <sys/fcntl.h>
   65 #include <sys/file.h>
   66 #include <sys/mac.h>
   67 #include <sys/mman.h>
   68 #include <sys/conf.h>
   69 #include <sys/stat.h>
   70 #include <sys/vmmeter.h>
   71 #include <sys/sysctl.h>
   72 
   73 #include <vm/vm.h>
   74 #include <vm/vm_param.h>
   75 #include <vm/pmap.h>
   76 #include <vm/vm_map.h>
   77 #include <vm/vm_object.h>
   78 #include <vm/vm_page.h>
   79 #include <vm/vm_pager.h>
   80 #include <vm/vm_pageout.h>
   81 #include <vm/vm_extern.h>
   83 #include <vm/vm_kern.h>
   84 
   85 #ifndef _SYS_SYSPROTO_H_
   86 struct sbrk_args {
   87         int incr;
   88 };
   89 #endif
   90 
   91 static int max_proc_mmap;
   92 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
   93 
   94 /*
    95  * Set the maximum number of vm_map_entry structures per process.  Roughly
    96  * speaking, vm_map_entry structures are tiny, so allowing them to eat 1/100
    97  * of our KVM malloc space still results in generous limits.  We want a
    98  * default that is good enough to prevent the kernel from running out of
    99  * resources if attacked from a compromised user account, but generous enough
   100  * that multi-threaded processes are not unduly inconvenienced.
  101  */
  102 static void vmmapentry_rsrc_init(void *);
  103 SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)
  104 
  105 static void
  106 vmmapentry_rsrc_init(dummy)
  107         void *dummy;
  108 {
  109     max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
  110     max_proc_mmap /= 100;
  111 }
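
/*
 * Worked example of the sizing above (illustrative; both inputs are
 * assumed values, not taken from this source): with a vm_kmem_size of
 * 40 MB and a vm_map_entry of roughly 64 bytes, the limit becomes
 *
 *	40 * 1024 * 1024 / 64 / 100 = 6553
 *
 * map entries per map reference, i.e. about 6500 mappings for a
 * process that has not shared its map via rfork().
 */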
  112 
  113 /*
  114  * MPSAFE
  115  */
  116 /* ARGSUSED */
  117 int
  118 sbrk(td, uap)
  119         struct thread *td;
  120         struct sbrk_args *uap;
  121 {
  122         /* Not yet implemented */
  123         /* mtx_lock(&Giant); */
  124         /* mtx_unlock(&Giant); */
  125         return (EOPNOTSUPP);
  126 }
  127 
  128 #ifndef _SYS_SYSPROTO_H_
  129 struct sstk_args {
  130         int incr;
  131 };
  132 #endif
  133 
  134 /*
  135  * MPSAFE
  136  */
  137 /* ARGSUSED */
  138 int
  139 sstk(td, uap)
  140         struct thread *td;
  141         struct sstk_args *uap;
  142 {
  143         /* Not yet implemented */
  144         /* mtx_lock(&Giant); */
  145         /* mtx_unlock(&Giant); */
  146         return (EOPNOTSUPP);
  147 }
  148 
  149 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
  150 #ifndef _SYS_SYSPROTO_H_
  151 struct getpagesize_args {
  152         int dummy;
  153 };
  154 #endif
  155 
  156 /* ARGSUSED */
  157 int
  158 ogetpagesize(td, uap)
  159         struct thread *td;
  160         struct getpagesize_args *uap;
  161 {
  162         /* MP SAFE */
  163         td->td_retval[0] = PAGE_SIZE;
  164         return (0);
  165 }
  166 #endif                          /* COMPAT_43 || COMPAT_SUNOS */
  167 
  168 
  169 /* 
  170  * Memory Map (mmap) system call.  Note that the file offset
  171  * and address are allowed to be NOT page aligned, though if
   172  * the MAP_FIXED flag is set, both must have the same remainder
  173  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  174  * page-aligned, the actual mapping starts at trunc_page(addr)
  175  * and the return value is adjusted up by the page offset.
  176  *
  177  * Generally speaking, only character devices which are themselves
  178  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  179  * there would be no cache coherency between a descriptor and a VM mapping
  180  * both to the same character device.
  181  *
  182  * Block devices can be mmap'd no matter what they represent.  Cache coherency
  183  * is maintained as long as you do not write directly to the underlying
  184  * character device.
  185  */
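
/*
 * Illustrative sketch of the alignment rule above (assumes a 4 KB
 * PAGE_SIZE; userland usage):
 *
 *	mmap((void *)0x10000100, 4096, PROT_READ, MAP_FIXED | MAP_SHARED,
 *	    fd, 0x100);		succeeds: addr and offset are both
 *				0x100 modulo the page size
 *	mmap((void *)0x10000000, 4096, PROT_READ, MAP_FIXED | MAP_SHARED,
 *	    fd, 0x100);		fails with EINVAL: remainders differ
 *
 * Without MAP_FIXED, a request at file offset 0x1234 is backed from
 * trunc_page(0x1234) = 0x1000, and the returned pointer is the mapped
 * page base plus 0x234.
 */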
  186 #ifndef _SYS_SYSPROTO_H_
  187 struct mmap_args {
  188         void *addr;
  189         size_t len;
  190         int prot;
  191         int flags;
  192         int fd;
  193         long pad;
  194         off_t pos;
  195 };
  196 #endif
  197 
  198 /*
  199  * MPSAFE
  200  */
  201 int
  202 mmap(td, uap)
  203         struct thread *td;
  204         struct mmap_args *uap;
  205 {
  206         struct file *fp = NULL;
  207         struct vnode *vp;
  208         vm_offset_t addr;
  209         vm_size_t size, pageoff;
  210         vm_prot_t prot, maxprot;
  211         void *handle;
  212         int flags, error;
  213         int disablexworkaround;
  214         off_t pos;
  215         struct vmspace *vms = td->td_proc->p_vmspace;
  216         vm_object_t obj;
  217 
  218         addr = (vm_offset_t) uap->addr;
  219         size = uap->len;
  220         prot = uap->prot & VM_PROT_ALL;
  221         flags = uap->flags;
  222         pos = uap->pos;
  223 
  224         vp = NULL;
  225         fp = NULL;
  226         /* make sure mapping fits into numeric range etc */
  227         if ((ssize_t) uap->len < 0 ||
  228             ((flags & MAP_ANON) && uap->fd != -1))
  229                 return (EINVAL);
  230 
  231         if (flags & MAP_STACK) {
  232                 if ((uap->fd != -1) ||
  233                     ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
  234                         return (EINVAL);
  235                 flags |= MAP_ANON;
  236                 pos = 0;
  237         }
  238 
  239         /*
  240          * Align the file position to a page boundary,
  241          * and save its page offset component.
  242          */
  243         pageoff = (pos & PAGE_MASK);
  244         pos -= pageoff;
  245 
  246         /* Adjust size for rounding (on both ends). */
  247         size += pageoff;                        /* low end... */
  248         size = (vm_size_t) round_page(size);    /* hi end */
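
        /*
         * Worked example of the rounding above (assuming 4 KB pages): a
         * request of len 100 at file offset 0x1234 gives pageoff 0x234,
         * pos 0x1000, and size = round_page(100 + 0x234) = 0x1000, i.e.
         * a single page covering every requested byte.
         */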
  249 
  250         /*
  251          * Check for illegal addresses.  Watch out for address wrap... Note
  252          * that VM_*_ADDRESS are not constants due to casts (argh).
  253          */
  254         if (flags & MAP_FIXED) {
  255                 /*
  256                  * The specified address must have the same remainder
  257                  * as the file offset taken modulo PAGE_SIZE, so it
  258                  * should be aligned after adjustment by pageoff.
  259                  */
  260                 addr -= pageoff;
  261                 if (addr & PAGE_MASK)
  262                         return (EINVAL);
  263                 /* Address range must be all in user VM space. */
  264                 if (addr < vm_map_min(&vms->vm_map) ||
  265                     addr + size > vm_map_max(&vms->vm_map))
  266                         return (EINVAL);
  267                 if (addr + size < addr)
  268                         return (EINVAL);
  269         }
  270         /*
  271          * XXX for non-fixed mappings where no hint is provided or
  272          * the hint would fall in the potential heap space,
  273          * place it after the end of the largest possible heap.
  274          *
  275          * There should really be a pmap call to determine a reasonable
  276          * location.
  277          */
  278         else if (addr == 0 ||
  279             (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
  280              addr < round_page((vm_offset_t)vms->vm_daddr +
  281               td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max)))
  282                 addr = round_page((vm_offset_t)vms->vm_daddr +
  283                     td->td_proc->p_rlimit[RLIMIT_DATA].rlim_max);
  284 
  285         mtx_lock(&Giant);       /* syscall marked mp-safe but isn't */
  286         do {
  287                 if (flags & MAP_ANON) {
  288                         /*
  289                          * Mapping blank space is trivial.
  290                          */
  291                         handle = NULL;
  292                         maxprot = VM_PROT_ALL;
  293                         pos = 0;
  294                         break;
  295                 }
  296                 /*
   297                  * Mapping file, get fp for validation.  Obtain vnode and make
   298                  * sure it is of the appropriate type.  Don't let the descriptor
   299                  * disappear on us if we block.
  300                  */
  301                 if ((error = fget(td, uap->fd, &fp)) != 0)
  302                         goto done;
  303                 if (fp->f_type != DTYPE_VNODE) {
  304                         error = EINVAL;
  305                         goto done;
  306                 }
  307 
  308                 /*
  309                  * POSIX shared-memory objects are defined to have
  310                  * kernel persistence, and are not defined to support
  311                  * read(2)/write(2) -- or even open(2).  Thus, we can
  312                  * use MAP_ASYNC to trade on-disk coherence for speed.
  313                  * The shm_open(3) library routine turns on the FPOSIXSHM
  314                  * flag to request this behavior.
  315                  */
  316                 if (fp->f_flag & FPOSIXSHM)
  317                         flags |= MAP_NOSYNC;
  318                 vp = fp->f_vnode;
  319                 error = vget(vp, LK_EXCLUSIVE, td);
  320                 if (error)
  321                         goto done;
  322                 if (vp->v_type != VREG && vp->v_type != VCHR) {
  323                         error = EINVAL;
  324                         goto done;
  325                 }
  326                 if (vp->v_type == VREG) {
  327                         /*
  328                          * Get the proper underlying object
  329                          */
  330                         if (VOP_GETVOBJECT(vp, &obj) != 0) {
  331                                 error = EINVAL;
  332                                 goto done;
  333                         }
  334                         if (obj->handle != vp) {
  335                                 vput(vp);
  336                                 vp = (struct vnode*)obj->handle;
  337                                 vget(vp, LK_EXCLUSIVE, td);
  338                         }
  339                 }
  340                 /*
  341                  * XXX hack to handle use of /dev/zero to map anon memory (ala
  342                  * SunOS).
  343                  */
  344                 if ((vp->v_type == VCHR) && 
  345                     (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) {
  346                         handle = NULL;
  347                         maxprot = VM_PROT_ALL;
  348                         flags |= MAP_ANON;
  349                         pos = 0;
  350                         break;
  351                 }
  352                 /*
   353                  * cdevs do not provide private mappings of any kind.
  354                  */
  355                 /*
  356                  * However, for XIG X server to continue to work,
  357                  * we should allow the superuser to do it anyway.
  358                  * We only allow it at securelevel < 1.
  359                  * (Because the XIG X server writes directly to video
  360                  * memory via /dev/mem, it should never work at any
   361                  * other securelevel.)
  362                  * XXX this will have to go
  363                  */
  364                 if (securelevel_ge(td->td_ucred, 1))
  365                         disablexworkaround = 1;
  366                 else
  367                         disablexworkaround = suser(td);
  368                 if (vp->v_type == VCHR && disablexworkaround &&
  369                     (flags & (MAP_PRIVATE|MAP_COPY))) {
  370                         error = EINVAL;
  371                         goto done;
  372                 }
  373                 /*
  374                  * Ensure that file and memory protections are
  375                  * compatible.  Note that we only worry about
  376                  * writability if mapping is shared; in this case,
  377                  * current and max prot are dictated by the open file.
  378                  * XXX use the vnode instead?  Problem is: what
  379                  * credentials do we use for determination? What if
  380                  * proc does a setuid?
  381                  */
  382                 maxprot = VM_PROT_EXECUTE;      /* ??? */
  383                 if (fp->f_flag & FREAD) {
  384                         maxprot |= VM_PROT_READ;
  385                 } else if (prot & PROT_READ) {
  386                         error = EACCES;
  387                         goto done;
  388                 }
  389                 /*
  390                  * If we are sharing potential changes (either via
  391                  * MAP_SHARED or via the implicit sharing of character
  392                  * device mappings), and we are trying to get write
  393                  * permission although we opened it without asking
  394                  * for it, bail out.  Check for superuser, only if
  395                  * we're at securelevel < 1, to allow the XIG X server
  396                  * to continue to work.
  397                  */
  398                 if ((flags & MAP_SHARED) != 0 ||
  399                     (vp->v_type == VCHR && disablexworkaround)) {
  400                         if ((fp->f_flag & FWRITE) != 0) {
  401                                 struct vattr va;
  402                                 if ((error =
  403                                     VOP_GETATTR(vp, &va,
  404                                                 td->td_ucred, td))) {
  405                                         goto done;
  406                                 }
  407                                 if ((va.va_flags &
  408                                    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) {
  409                                         maxprot |= VM_PROT_WRITE;
  410                                 } else if (prot & PROT_WRITE) {
  411                                         error = EPERM;
  412                                         goto done;
  413                                 }
  414                         } else if ((prot & PROT_WRITE) != 0) {
  415                                 error = EACCES;
  416                                 goto done;
  417                         }
  418                 } else {
  419                         maxprot |= VM_PROT_WRITE;
  420                 }
  421 
  422                 handle = (void *)vp;
  423         } while (0);
  424 
  425         /*
   426  * Do not allow more than a certain number of vm_map_entry structures
  427          * per process.  Scale with the number of rforks sharing the map
  428          * to make the limit reasonable for threads.
  429          */
  430         if (max_proc_mmap && 
  431             vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
  432                 error = ENOMEM;
  433                 goto done;
  434         }
  435 
  436         error = 0;
  437 #ifdef MAC
  438         if (handle != NULL && (flags & MAP_SHARED) != 0) {
  439                 error = mac_check_vnode_mmap(td->td_ucred,
  440                     (struct vnode *)handle, prot);
  441         }
  442 #endif
  443         if (vp != NULL) {
  444                 vput(vp);
  445                 vp = NULL;
  446         }
  447         mtx_unlock(&Giant);
  448         if (error == 0)
  449                 error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
  450                     flags, handle, pos);
  451         mtx_lock(&Giant);
  452         if (error == 0)
  453                 td->td_retval[0] = (register_t) (addr + pageoff);
  454 done:
  455         if (vp)
  456                 vput(vp);
  457         mtx_unlock(&Giant);
  458         if (fp)
  459                 fdrop(fp, td);
  460 
  461         return (error);
  462 }
  463 
  464 #ifdef COMPAT_43
  465 #ifndef _SYS_SYSPROTO_H_
  466 struct ommap_args {
  467         caddr_t addr;
  468         int len;
  469         int prot;
  470         int flags;
  471         int fd;
  472         long pos;
  473 };
  474 #endif
  475 int
  476 ommap(td, uap)
  477         struct thread *td;
  478         struct ommap_args *uap;
  479 {
  480         struct mmap_args nargs;
  481         static const char cvtbsdprot[8] = {
  482                 0,
  483                 PROT_EXEC,
  484                 PROT_WRITE,
  485                 PROT_EXEC | PROT_WRITE,
  486                 PROT_READ,
  487                 PROT_EXEC | PROT_READ,
  488                 PROT_WRITE | PROT_READ,
  489                 PROT_EXEC | PROT_WRITE | PROT_READ,
  490         };
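
        /*
         * Illustrative reading of the table above: the old 4.3BSD
         * encoding is exec = 1, write = 2, read = 4, so an old prot of
         * 6 (write|read) selects cvtbsdprot[6], which is
         * PROT_WRITE | PROT_READ in the modern encoding.
         */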
  491 
  492 #define OMAP_ANON       0x0002
  493 #define OMAP_COPY       0x0020
  494 #define OMAP_SHARED     0x0010
  495 #define OMAP_FIXED      0x0100
  496 
  497         nargs.addr = uap->addr;
  498         nargs.len = uap->len;
  499         nargs.prot = cvtbsdprot[uap->prot & 0x7];
  500         nargs.flags = 0;
  501         if (uap->flags & OMAP_ANON)
  502                 nargs.flags |= MAP_ANON;
  503         if (uap->flags & OMAP_COPY)
  504                 nargs.flags |= MAP_COPY;
  505         if (uap->flags & OMAP_SHARED)
  506                 nargs.flags |= MAP_SHARED;
  507         else
  508                 nargs.flags |= MAP_PRIVATE;
  509         if (uap->flags & OMAP_FIXED)
  510                 nargs.flags |= MAP_FIXED;
  511         nargs.fd = uap->fd;
  512         nargs.pos = uap->pos;
  513         return (mmap(td, &nargs));
  514 }
  515 #endif                          /* COMPAT_43 */
  516 
  517 
  518 #ifndef _SYS_SYSPROTO_H_
  519 struct msync_args {
  520         void *addr;
  521         int len;
  522         int flags;
  523 };
  524 #endif
  525 /*
  526  * MPSAFE
  527  */
  528 int
  529 msync(td, uap)
  530         struct thread *td;
  531         struct msync_args *uap;
  532 {
  533         vm_offset_t addr;
  534         vm_size_t size, pageoff;
  535         int flags;
  536         vm_map_t map;
  537         int rv;
  538 
  539         addr = (vm_offset_t) uap->addr;
  540         size = uap->len;
  541         flags = uap->flags;
  542 
  543         pageoff = (addr & PAGE_MASK);
  544         addr -= pageoff;
  545         size += pageoff;
  546         size = (vm_size_t) round_page(size);
  547         if (addr + size < addr)
  548                 return (EINVAL);
  549 
  550         if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
  551                 return (EINVAL);
  552 
  553         map = &td->td_proc->p_vmspace->vm_map;
  554 
  555         /*
  556          * Clean the pages and interpret the return value.
  557          */
  558         rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
  559             (flags & MS_INVALIDATE) != 0);
  560         switch (rv) {
  561         case KERN_SUCCESS:
  562                 return (0);
  563         case KERN_INVALID_ADDRESS:
  564                 return (EINVAL);        /* Sun returns ENOMEM? */
  565         case KERN_INVALID_ARGUMENT:
  566                 return (EBUSY);
  567         default:
  568                 return (EINVAL);
  569         }
  570 }
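
/*
 * Userland usage sketch (illustrative): synchronously flush a mapped
 * region and invalidate cached pages; note that MS_ASYNC together with
 * MS_INVALIDATE is rejected above with EINVAL.
 *
 *	if (msync(p, len, MS_SYNC | MS_INVALIDATE) == -1)
 *		warn("msync");
 */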
  571 
  572 #ifndef _SYS_SYSPROTO_H_
  573 struct munmap_args {
  574         void *addr;
  575         size_t len;
  576 };
  577 #endif
  578 /*
  579  * MPSAFE
  580  */
  581 int
  582 munmap(td, uap)
  583         struct thread *td;
  584         struct munmap_args *uap;
  585 {
  586         vm_offset_t addr;
  587         vm_size_t size, pageoff;
  588         vm_map_t map;
  589 
  590         addr = (vm_offset_t) uap->addr;
  591         size = uap->len;
  592         if (size == 0)
  593                 return (EINVAL);
  594 
  595         pageoff = (addr & PAGE_MASK);
  596         addr -= pageoff;
  597         size += pageoff;
  598         size = (vm_size_t) round_page(size);
  599         if (addr + size < addr)
  600                 return (EINVAL);
  601 
  602         /*
  603          * Check for illegal addresses.  Watch out for address wrap...
  604          */
  605         map = &td->td_proc->p_vmspace->vm_map;
  606         if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
  607                 return (EINVAL);
  608         vm_map_lock(map);
  609         /*
  610          * Make sure entire range is allocated.
  611          */
  612         if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
  613                 vm_map_unlock(map);
  614                 return (EINVAL);
  615         }
  616         /* returns nothing but KERN_SUCCESS anyway */
  617         vm_map_delete(map, addr, addr + size);
  618         vm_map_unlock(map);
  619         return (0);
  620 }
  621 
  622 #ifndef _SYS_SYSPROTO_H_
  623 struct mprotect_args {
  624         const void *addr;
  625         size_t len;
  626         int prot;
  627 };
  628 #endif
  629 /*
  630  * MPSAFE
  631  */
  632 int
  633 mprotect(td, uap)
  634         struct thread *td;
  635         struct mprotect_args *uap;
  636 {
  637         vm_offset_t addr;
  638         vm_size_t size, pageoff;
  639         vm_prot_t prot;
  640 
  641         addr = (vm_offset_t) uap->addr;
  642         size = uap->len;
  643         prot = uap->prot & VM_PROT_ALL;
  644 #if defined(VM_PROT_READ_IS_EXEC)
  645         if (prot & VM_PROT_READ)
  646                 prot |= VM_PROT_EXECUTE;
  647 #endif
  648 
  649         pageoff = (addr & PAGE_MASK);
  650         addr -= pageoff;
  651         size += pageoff;
  652         size = (vm_size_t) round_page(size);
  653         if (addr + size < addr)
  654                 return (EINVAL);
  655 
  656         switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
  657             addr + size, prot, FALSE)) {
  658         case KERN_SUCCESS:
  659                 return (0);
  660         case KERN_PROTECTION_FAILURE:
  661                 return (EACCES);
  662         }
  663         return (EINVAL);
  664 }
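
/*
 * Userland usage sketch (illustrative): revoke write permission on a
 * region.  Note the VM_PROT_READ_IS_EXEC case above: on platforms where
 * readable implies executable (i386 of this era, for example), asking
 * for PROT_READ effectively grants execute permission too.
 *
 *	if (mprotect(p, len, PROT_READ) == -1)
 *		err(1, "mprotect");
 */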
  665 
  666 #ifndef _SYS_SYSPROTO_H_
  667 struct minherit_args {
  668         void *addr;
  669         size_t len;
  670         int inherit;
  671 };
  672 #endif
  673 /*
  674  * MPSAFE
  675  */
  676 int
  677 minherit(td, uap)
  678         struct thread *td;
  679         struct minherit_args *uap;
  680 {
  681         vm_offset_t addr;
  682         vm_size_t size, pageoff;
  683         vm_inherit_t inherit;
  684 
  685         addr = (vm_offset_t)uap->addr;
  686         size = uap->len;
  687         inherit = uap->inherit;
  688 
  689         pageoff = (addr & PAGE_MASK);
  690         addr -= pageoff;
  691         size += pageoff;
  692         size = (vm_size_t) round_page(size);
  693         if (addr + size < addr)
  694                 return (EINVAL);
  695 
  696         switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
  697             addr + size, inherit)) {
  698         case KERN_SUCCESS:
  699                 return (0);
  700         case KERN_PROTECTION_FAILURE:
  701                 return (EACCES);
  702         }
  703         return (EINVAL);
  704 }
  705 
  706 #ifndef _SYS_SYSPROTO_H_
  707 struct madvise_args {
  708         void *addr;
  709         size_t len;
  710         int behav;
  711 };
  712 #endif
  713 
  714 /*
  715  * MPSAFE
  716  */
  717 /* ARGSUSED */
  718 int
  719 madvise(td, uap)
  720         struct thread *td;
  721         struct madvise_args *uap;
  722 {
  723         vm_offset_t start, end;
  724         vm_map_t map;
  725         struct proc *p;
  726         int error;
  727 
  728         /*
  729          * Check for our special case, advising the swap pager we are
  730          * "immortal."
  731          */
  732         if (uap->behav == MADV_PROTECT) {
  733                 error = suser(td);
  734                 if (error == 0) {
  735                         p = td->td_proc;
  736                         PROC_LOCK(p);
  737                         p->p_flag |= P_PROTECTED;
  738                         PROC_UNLOCK(p);
  739                 }
  740                 return (error);
  741         }
  742         /*
  743          * Check for illegal behavior
  744          */
  745         if (uap->behav < 0 || uap->behav > MADV_CORE)
  746                 return (EINVAL);
  747         /*
  748          * Check for illegal addresses.  Watch out for address wrap... Note
  749          * that VM_*_ADDRESS are not constants due to casts (argh).
  750          */
  751         map = &td->td_proc->p_vmspace->vm_map;
  752         if ((vm_offset_t)uap->addr < vm_map_min(map) ||
  753             (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
  754                 return (EINVAL);
  755         if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
  756                 return (EINVAL);
  757 
  758         /*
  759          * Since this routine is only advisory, we default to conservative
  760          * behavior.
  761          */
  762         start = trunc_page((vm_offset_t) uap->addr);
  763         end = round_page((vm_offset_t) uap->addr + uap->len);
  764         
  765         if (vm_map_madvise(map, start, end, uap->behav))
  766                 return (EINVAL);
  767         return (0);
  768 }
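
/*
 * Usage sketch (illustrative): MADV_PROTECT is handled above before any
 * address checks, so the range is ignored; it marks the process
 * "immortal" to the out-of-swap killer and requires superuser.  Range
 * advice works as usual:
 *
 *	madvise(NULL, 0, MADV_PROTECT);
 *	madvise(p, len, MADV_WILLNEED);
 */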
  769 
  770 #ifndef _SYS_SYSPROTO_H_
  771 struct mincore_args {
  772         const void *addr;
  773         size_t len;
  774         char *vec;
  775 };
  776 #endif
  777 
  778 /*
  779  * MPSAFE
  780  */
  781 /* ARGSUSED */
  782 int
  783 mincore(td, uap)
  784         struct thread *td;
  785         struct mincore_args *uap;
  786 {
  787         vm_offset_t addr, first_addr;
  788         vm_offset_t end, cend;
  789         pmap_t pmap;
  790         vm_map_t map;
  791         char *vec;
  792         int error = 0;
  793         int vecindex, lastvecindex;
  794         vm_map_entry_t current;
  795         vm_map_entry_t entry;
  796         int mincoreinfo;
  797         unsigned int timestamp;
  798 
  799         /*
  800          * Make sure that the addresses presented are valid for user
  801          * mode.
  802          */
  803         first_addr = addr = trunc_page((vm_offset_t) uap->addr);
  804         end = addr + (vm_size_t)round_page(uap->len);
  805         map = &td->td_proc->p_vmspace->vm_map;
  806         if (end > vm_map_max(map) || end < addr)
  807                 return (EINVAL);
  808 
  809         /*
  810          * Address of byte vector
  811          */
  812         vec = uap->vec;
  813 
  814         pmap = vmspace_pmap(td->td_proc->p_vmspace);
  815 
  816         vm_map_lock_read(map);
  817 RestartScan:
  818         timestamp = map->timestamp;
  819 
  820         if (!vm_map_lookup_entry(map, addr, &entry))
  821                 entry = entry->next;
  822 
  823         /*
  824          * Do this on a map entry basis so that if the pages are not
   825  * in the current process's address space, we can easily look
  826          * up the pages elsewhere.
  827          */
  828         lastvecindex = -1;
  829         for (current = entry;
  830             (current != &map->header) && (current->start < end);
  831             current = current->next) {
  832 
  833                 /*
  834                  * ignore submaps (for now) or null objects
  835                  */
  836                 if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
  837                         current->object.vm_object == NULL)
  838                         continue;
  839                 
  840                 /*
  841                  * limit this scan to the current map entry and the
  842                  * limits for the mincore call
  843                  */
  844                 if (addr < current->start)
  845                         addr = current->start;
  846                 cend = current->end;
  847                 if (cend > end)
  848                         cend = end;
  849 
  850                 /*
  851                  * scan this entry one page at a time
  852                  */
  853                 while (addr < cend) {
  854                         /*
  855                          * Check pmap first, it is likely faster, also
  856                          * it can provide info as to whether we are the
  857                          * one referencing or modifying the page.
  858                          */
  859                         mtx_lock(&Giant);
  860                         mincoreinfo = pmap_mincore(pmap, addr);
  861                         mtx_unlock(&Giant);
  862                         if (!mincoreinfo) {
  863                                 vm_pindex_t pindex;
  864                                 vm_ooffset_t offset;
  865                                 vm_page_t m;
  866                                 /*
  867                                  * calculate the page index into the object
  868                                  */
  869                                 offset = current->offset + (addr - current->start);
  870                                 pindex = OFF_TO_IDX(offset);
  871                                 VM_OBJECT_LOCK(current->object.vm_object);
  872                                 m = vm_page_lookup(current->object.vm_object,
  873                                         pindex);
  874                                 /*
  875                                  * if the page is resident, then gather information about
  876                                  * it.
  877                                  */
  878                                 if (m) {
  879                                         mincoreinfo = MINCORE_INCORE;
  880                                         vm_page_lock_queues();
  881                                         if (m->dirty ||
  882                                                 pmap_is_modified(m))
  883                                                 mincoreinfo |= MINCORE_MODIFIED_OTHER;
  884                                         if ((m->flags & PG_REFERENCED) ||
  885                                                 pmap_ts_referenced(m)) {
  886                                                 vm_page_flag_set(m, PG_REFERENCED);
  887                                                 mincoreinfo |= MINCORE_REFERENCED_OTHER;
  888                                         }
  889                                         vm_page_unlock_queues();
  890                                 }
  891                                 VM_OBJECT_UNLOCK(current->object.vm_object);
  892                         }
  893 
  894                         /*
  895                          * subyte may page fault.  In case it needs to modify
  896                          * the map, we release the lock.
  897                          */
  898                         vm_map_unlock_read(map);
  899 
  900                         /*
  901                          * calculate index into user supplied byte vector
  902                          */
  903                         vecindex = OFF_TO_IDX(addr - first_addr);
  904 
  905                         /*
  906                          * If we have skipped map entries, we need to make sure that
  907                          * the byte vector is zeroed for those skipped entries.
  908                          */
   909                         while ((lastvecindex + 1) < vecindex) {
   910                                 ++lastvecindex;
   911                                 error = subyte(vec + lastvecindex, 0);
   912                                 if (error) {
   913                                         error = EFAULT;
   914                                         goto done2;
   915                                 }
   916                         }
  917 
  918                         /*
  919                          * Pass the page information to the user
  920                          */
  921                         error = subyte(vec + vecindex, mincoreinfo);
  922                         if (error) {
  923                                 error = EFAULT;
  924                                 goto done2;
  925                         }
  926 
  927                         /*
  928                          * If the map has changed, due to the subyte, the previous
  929                          * output may be invalid.
  930                          */
  931                         vm_map_lock_read(map);
  932                         if (timestamp != map->timestamp)
  933                                 goto RestartScan;
  934 
  935                         lastvecindex = vecindex;
  936                         addr += PAGE_SIZE;
  937                 }
  938         }
  939 
  940         /*
  941          * subyte may page fault.  In case it needs to modify
  942          * the map, we release the lock.
  943          */
  944         vm_map_unlock_read(map);
  945 
  946         /*
  947          * Zero the last entries in the byte vector.
  948          */
  949         vecindex = OFF_TO_IDX(end - first_addr);
   950         while ((lastvecindex + 1) < vecindex) {
   951                 ++lastvecindex;
   952                 error = subyte(vec + lastvecindex, 0);
   953                 if (error) {
   954                         error = EFAULT;
   955                         goto done2;
   956                 }
   957         }
  958         
  959         /*
  960          * If the map has changed, due to the subyte, the previous
  961          * output may be invalid.
  962          */
  963         vm_map_lock_read(map);
  964         if (timestamp != map->timestamp)
  965                 goto RestartScan;
  966         vm_map_unlock_read(map);
  967 done2:
  968         return (error);
  969 }
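
/*
 * Userland usage sketch (illustrative): mincore() stores one status
 * byte per page of the range; bits such as MINCORE_INCORE and
 * MINCORE_MODIFIED_OTHER come from the scan above.  Includes and error
 * handling elided:
 *
 *	size_t pgsz = getpagesize();
 *	size_t npages = (len + pgsz - 1) / pgsz;
 *	char *vec = malloc(npages);
 *	if (vec != NULL && mincore(addr, len, vec) == 0 &&
 *	    (vec[0] & MINCORE_INCORE))
 *		printf("first page is resident\n");
 */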
  970 
  971 #ifndef _SYS_SYSPROTO_H_
  972 struct mlock_args {
  973         const void *addr;
  974         size_t len;
  975 };
  976 #endif
  977 /*
  978  * MPSAFE
  979  */
  980 int
  981 mlock(td, uap)
  982         struct thread *td;
  983         struct mlock_args *uap;
  984 {
  985         vm_offset_t addr;
  986         vm_size_t size, pageoff;
  987         int error;
  988 
  989         addr = (vm_offset_t) uap->addr;
  990         size = uap->len;
  991 
  992         pageoff = (addr & PAGE_MASK);
  993         addr -= pageoff;
  994         size += pageoff;
  995         size = (vm_size_t) round_page(size);
  996 
  997         /* disable wrap around */
  998         if (addr + size < addr)
  999                 return (EINVAL);
 1000 
 1001         if (atop(size) + cnt.v_wire_count > vm_page_max_wired)
 1002                 return (EAGAIN);
 1003 
 1004 #if 0
 1005         if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) >
 1006             td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
 1007                 return (ENOMEM);
 1008 #else
 1009         error = suser(td);
 1010         if (error)
 1011                 return (error);
 1012 #endif
 1013 
 1014         error = vm_map_wire(&td->td_proc->p_vmspace->vm_map, addr,
 1015                      addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
 1016         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1017 }
 1018 
 1019 #ifndef _SYS_SYSPROTO_H_
 1020 struct mlockall_args {
 1021         int     how;
 1022 };
 1023 #endif
 1024 
 1025 /*
 1026  * MPSAFE
 1027  */
 1028 int
 1029 mlockall(td, uap)
 1030         struct thread *td;
 1031         struct mlockall_args *uap;
 1032 {
 1033         vm_map_t map;
 1034         int error;
 1035 
 1036         map = &td->td_proc->p_vmspace->vm_map;
 1037         error = 0;
 1038 
 1039         if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
 1040                 return (EINVAL);
 1041 
 1042 #if 0
 1043         /*
 1044          * If wiring all pages in the process would cause it to exceed
 1045          * a hard resource limit, return ENOMEM.
 1046          */
  1047         if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
  1048             td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
 1049                 return (ENOMEM);
 1050 #else
 1051         error = suser(td);
 1052         if (error)
 1053                 return (error);
 1054 #endif
 1055 
 1056         if (uap->how & MCL_FUTURE) {
 1057                 vm_map_lock(map);
 1058                 vm_map_modflags(map, MAP_WIREFUTURE, 0);
 1059                 vm_map_unlock(map);
 1060                 error = 0;
 1061         }
 1062 
 1063         if (uap->how & MCL_CURRENT) {
 1064                 /*
 1065                  * P1003.1-2001 mandates that all currently mapped pages
 1066                  * will be memory resident and locked (wired) upon return
 1067                  * from mlockall(). vm_map_wire() will wire pages, by
 1068                  * calling vm_fault_wire() for each page in the region.
 1069                  */
 1070                 error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
 1071                     VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1072                 error = (error == KERN_SUCCESS ? 0 : EAGAIN);
 1073         }
 1074 
 1075         return (error);
 1076 }
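
/*
 * Usage sketch (illustrative): wire down everything currently mapped
 * and request that future mappings be wired at creation time (the
 * MAP_WIREFUTURE flag consulted by vm_mmap() later in this file).
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 */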
 1077 
 1078 #ifndef _SYS_SYSPROTO_H_
 1079 struct munlockall_args {
 1080         register_t dummy;
 1081 };
 1082 #endif
 1083 
 1084 /*
 1085  * MPSAFE
 1086  */
 1087 int
 1088 munlockall(td, uap)
 1089         struct thread *td;
 1090         struct munlockall_args *uap;
 1091 {
 1092         vm_map_t map;
 1093         int error;
 1094 
 1095         map = &td->td_proc->p_vmspace->vm_map;
 1096         error = suser(td);
 1097         if (error)
 1098                 return (error);
 1099 
 1100         /* Clear the MAP_WIREFUTURE flag from this vm_map. */
 1101         vm_map_lock(map);
 1102         vm_map_modflags(map, 0, MAP_WIREFUTURE);
 1103         vm_map_unlock(map);
 1104 
 1105         /* Forcibly unwire all pages. */
 1106         error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
 1107             VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
 1108 
 1109         return (error);
 1110 }
 1111 
 1112 #ifndef _SYS_SYSPROTO_H_
 1113 struct munlock_args {
 1114         const void *addr;
 1115         size_t len;
 1116 };
 1117 #endif
 1118 /*
 1119  * MPSAFE
 1120  */
 1121 int
 1122 munlock(td, uap)
 1123         struct thread *td;
 1124         struct munlock_args *uap;
 1125 {
 1126         vm_offset_t addr;
 1127         vm_size_t size, pageoff;
 1128         int error;
 1129 
 1130         addr = (vm_offset_t) uap->addr;
 1131         size = uap->len;
 1132 
 1133         pageoff = (addr & PAGE_MASK);
 1134         addr -= pageoff;
 1135         size += pageoff;
 1136         size = (vm_size_t) round_page(size);
 1137 
 1138         /* disable wrap around */
 1139         if (addr + size < addr)
 1140                 return (EINVAL);
 1141 
 1142         error = suser(td);
 1143         if (error)
 1144                 return (error);
 1145 
 1146         error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, addr,
 1147                      addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
 1148         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1149 }
 1150 
 1151 /*
 1152  * vm_mmap()
 1153  *
 1154  * MPSAFE
 1155  *
 1156  * Internal version of mmap.  Currently used by mmap, exec, and sys5
 1157  * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 1158  */
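
/*
 * Sketch of an internal caller (illustrative): an anonymous mapping
 * passes a NULL handle and a zero offset.
 *
 *	vm_offset_t va = 0;
 *	error = vm_mmap(&p->p_vmspace->vm_map, &va, round_page(len),
 *	    VM_PROT_ALL, VM_PROT_ALL, MAP_ANON, NULL, 0);
 */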
 1159 int
 1160 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1161         vm_prot_t maxprot, int flags,
 1162         void *handle,
 1163         vm_ooffset_t foff)
 1164 {
 1165         boolean_t fitit;
 1166         vm_object_t object;
 1167         struct vnode *vp = NULL;
 1168         objtype_t type;
 1169         int rv = KERN_SUCCESS;
 1170         vm_ooffset_t objsize;
 1171         int docow, error;
 1172         struct thread *td = curthread;
 1173 
 1174         if (size == 0)
 1175                 return (0);
 1176 
 1177         objsize = size = round_page(size);
 1178 
 1179         if (td->td_proc->p_vmspace->vm_map.size + size >
 1180             td->td_proc->p_rlimit[RLIMIT_VMEM].rlim_cur) {
 1181                 return(ENOMEM);
 1182         }
 1183 
 1184         /*
 1185          * We currently can only deal with page aligned file offsets.
 1186          * The check is here rather than in the syscall because the
  1187  * kernel calls this function internally for other mmapping
 1188          * operations (such as in exec) and non-aligned offsets will
 1189          * cause pmap inconsistencies...so we want to be sure to
 1190          * disallow this in all cases.
 1191          */
 1192         if (foff & PAGE_MASK)
 1193                 return (EINVAL);
 1194 
 1195         if ((flags & MAP_FIXED) == 0) {
 1196                 fitit = TRUE;
 1197                 *addr = round_page(*addr);
 1198         } else {
 1199                 if (*addr != trunc_page(*addr))
 1200                         return (EINVAL);
 1201                 fitit = FALSE;
 1202                 (void) vm_map_remove(map, *addr, *addr + size);
 1203         }
 1204 
 1205         /*
 1206          * Lookup/allocate object.
 1207          */
 1208         if (flags & MAP_ANON) {
 1209                 type = OBJT_DEFAULT;
 1210                 /*
 1211                  * Unnamed anonymous regions always start at 0.
 1212                  */
 1213                 if (handle == 0)
 1214                         foff = 0;
 1215         } else {
 1216                 vp = (struct vnode *) handle;
 1217                 mtx_lock(&Giant);
 1218                 error = vget(vp, LK_EXCLUSIVE, td);
 1219                 if (error) {
 1220                         mtx_unlock(&Giant);
 1221                         return (error);
 1222                 }
 1223                 if (vp->v_type == VCHR) {
 1224                         type = OBJT_DEVICE;
 1225                         handle = vp->v_rdev;
 1226                         vput(vp);
 1227                         mtx_unlock(&Giant);
 1228                 } else {
 1229                         struct vattr vat;
 1230 
 1231                         error = VOP_GETATTR(vp, &vat, td->td_ucred, td);
 1232                         if (error) {
 1233                                 vput(vp);
 1234                                 mtx_unlock(&Giant);
 1235                                 return (error);
 1236                         }
 1237                         objsize = round_page(vat.va_size);
 1238                         type = OBJT_VNODE;
 1239                         /*
 1240                          * if it is a regular file without any references
 1241                          * we do not need to sync it.
 1242                          */
 1243                         if (vp->v_type == VREG && vat.va_nlink == 0) {
 1244                                 flags |= MAP_NOSYNC;
 1245                         }
 1246                 }
 1247         }
 1248 
 1249         if (handle == NULL) {
 1250                 object = NULL;
 1251                 docow = 0;
 1252         } else {
 1253                 object = vm_pager_allocate(type,
 1254                         handle, objsize, prot, foff);
 1255                 if (type == OBJT_VNODE) {
 1256                         vput(vp);
 1257                         mtx_unlock(&Giant);
 1258                 }
 1259                 if (object == NULL) {
 1260                         return (type == OBJT_DEVICE ? EINVAL : ENOMEM);
 1261                 }
 1262                 docow = MAP_PREFAULT_PARTIAL;
 1263         }
 1264 
 1265         /*
 1266          * Force device mappings to be shared.
 1267          */
 1268         if (type == OBJT_DEVICE) {
 1269                 flags &= ~(MAP_PRIVATE|MAP_COPY);
 1270                 flags |= MAP_SHARED;
 1271         }
 1272 
 1273         if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 1274                 docow |= MAP_COPY_ON_WRITE;
 1275         if (flags & MAP_NOSYNC)
 1276                 docow |= MAP_DISABLE_SYNCER;
 1277         if (flags & MAP_NOCORE)
 1278                 docow |= MAP_DISABLE_COREDUMP;
 1279 
 1280 #if defined(VM_PROT_READ_IS_EXEC)
 1281         if (prot & VM_PROT_READ)
 1282                 prot |= VM_PROT_EXECUTE;
 1283 
 1284         if (maxprot & VM_PROT_READ)
 1285                 maxprot |= VM_PROT_EXECUTE;
 1286 #endif
 1287 
 1288         if (fitit)
 1289                 *addr = pmap_addr_hint(object, *addr, size);
 1290 
 1291         if (flags & MAP_STACK)
 1292                 rv = vm_map_stack(map, *addr, size, prot, maxprot,
 1293                     docow | MAP_STACK_GROWS_DOWN);
 1294         else
 1295                 rv = vm_map_find(map, object, foff, addr, size, fitit,
 1296                                  prot, maxprot, docow);
 1297 
 1298         if (rv != KERN_SUCCESS) {
 1299                 /*
 1300                  * Lose the object reference. Will destroy the
 1301                  * object if it's an unnamed anonymous mapping
 1302                  * or named anonymous without other references.
 1303                  */
 1304                 vm_object_deallocate(object);
 1305         } else if (flags & MAP_SHARED) {
 1306                 /*
 1307                  * Shared memory is also shared with children.
 1308                  */
 1309                 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
 1310                 if (rv != KERN_SUCCESS)
 1311                         (void) vm_map_remove(map, *addr, *addr + size);
 1312         }
 1313 
 1314         /*
 1315          * If the process has requested that all future mappings
 1316          * be wired, then heed this.
 1317          */
 1318         if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
 1319                 vm_map_wire(map, *addr, *addr + size,
 1320                     VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
 1321 
 1322         switch (rv) {
 1323         case KERN_SUCCESS:
 1324                 return (0);
 1325         case KERN_INVALID_ADDRESS:
 1326         case KERN_NO_SPACE:
 1327                 return (ENOMEM);
 1328         case KERN_PROTECTION_FAILURE:
 1329                 return (EACCES);
 1330         default:
 1331                 return (EINVAL);
 1332         }
 1333 }
