FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c


    1 /*
    2  * (MPSAFE)
    3  *
    4  * Copyright (c) 1988 University of Utah.
    5  * Copyright (c) 1991, 1993
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * This code is derived from software contributed to Berkeley by
    9  * the Systems Programming Group of the University of Utah Computer
   10  * Science Department.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   37  *
   38  *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
   39  * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
   40  */
   41 
   42 /*
   43  * Mapped file (mmap) interface to VM
   44  */
   45 
   46 #include <sys/param.h>
   47 #include <sys/kernel.h>
   48 #include <sys/systm.h>
   49 #include <sys/sysproto.h>
   50 #include <sys/filedesc.h>
   51 #include <sys/kern_syscall.h>
   52 #include <sys/proc.h>
   53 #include <sys/priv.h>
   54 #include <sys/resource.h>
   55 #include <sys/resourcevar.h>
   56 #include <sys/vnode.h>
   57 #include <sys/fcntl.h>
   58 #include <sys/file.h>
   59 #include <sys/mman.h>
   60 #include <sys/conf.h>
   61 #include <sys/stat.h>
   62 #include <sys/vmmeter.h>
   63 #include <sys/sysctl.h>
   64 
   65 #include <vm/vm.h>
   66 #include <vm/vm_param.h>
   67 #include <sys/lock.h>
   68 #include <vm/pmap.h>
   69 #include <vm/vm_map.h>
   70 #include <vm/vm_object.h>
   71 #include <vm/vm_page.h>
   72 #include <vm/vm_pager.h>
   73 #include <vm/vm_pageout.h>
   74 #include <vm/vm_extern.h>
   75 #include <vm/vm_kern.h>
   76 
   77 #include <sys/file2.h>
   78 #include <sys/thread.h>
   79 #include <sys/thread2.h>
   80 #include <vm/vm_page2.h>
   81 
   82 static int max_proc_mmap;
    83 SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "Maximum number of vm_map_entry structures per process");
   84 int vkernel_enable;
    85 SYSCTL_INT(_vm, OID_AUTO, vkernel_enable, CTLFLAG_RW, &vkernel_enable, 0, "Allow use of MAP_VPAGETABLE mappings (vkernel support)");
   86 
   87 /*
   88  * Set the maximum number of vm_map_entry structures per process.  Roughly
   89  * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
   90  * of our KVM malloc space still results in generous limits.  We want a 
   91  * default that is good enough to prevent the kernel running out of resources
   92  * if attacked from compromised user account but generous enough such that
   93  * multi-threaded processes are not unduly inconvenienced.
   94  */
   95 
   96 static void vmmapentry_rsrc_init (void *);
   97 SYSINIT(vmmersrc, SI_BOOT1_POST, SI_ORDER_ANY, vmmapentry_rsrc_init, NULL)
   98 
   99 static void
  100 vmmapentry_rsrc_init(void *dummy)
  101 {
   102         max_proc_mmap = KvaSize / sizeof(struct vm_map_entry);
   103         max_proc_mmap /= 100;
  104 }
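
For a sense of scale, the sketch below reproduces the same arithmetic in
userland.  The KVA size and vm_map_entry size are hypothetical example
figures, not the kernel's actual values, which vary by platform and version:

#include <stdio.h>

int
main(void)
{
        /*
         * Hypothetical figures: 8 GiB of kernel virtual address space
         * and a 128-byte struct vm_map_entry.
         */
        unsigned long long kva_size = 8ULL << 30;
        unsigned long long entry_size = 128;
        unsigned long long limit = kva_size / entry_size / 100;

        printf("default vm.max_proc_mmap ~= %llu entries per process\n",
            limit);
        return (0);
}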
  105 
  106 /*
  107  * MPSAFE
  108  */
  109 int
  110 sys_sbrk(struct sbrk_args *uap)
  111 {
  112         /* Not yet implemented */
  113         return (EOPNOTSUPP);
  114 }
  115 
  116 /*
  117  * sstk_args(int incr)
  118  *
  119  * MPSAFE
  120  */
  121 int
  122 sys_sstk(struct sstk_args *uap)
  123 {
  124         /* Not yet implemented */
  125         return (EOPNOTSUPP);
  126 }
  127 
  128 /* 
  129  * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
  130  *              long pad, off_t pos)
  131  *
  132  * Memory Map (mmap) system call.  Note that the file offset
  133  * and address are allowed to be NOT page aligned, though if
   134  * the MAP_FIXED flag is set, both must have the same remainder
  135  * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
  136  * page-aligned, the actual mapping starts at trunc_page(addr)
  137  * and the return value is adjusted up by the page offset.
  138  *
  139  * Generally speaking, only character devices which are themselves
  140  * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
  141  * there would be no cache coherency between a descriptor and a VM mapping
  142  * both to the same character device.
  143  *
  144  * Block devices can be mmap'd no matter what they represent.  Cache coherency
  145  * is maintained as long as you do not write directly to the underlying
  146  * character device.
  147  *
  148  * No requirements
  149  */
  150 int
  151 kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen,
  152           int uprot, int uflags, int fd, off_t upos, void **res)
  153 {
  154         struct thread *td = curthread;
  155         struct proc *p = td->td_proc;
  156         struct file *fp = NULL;
  157         struct vnode *vp;
  158         vm_offset_t addr;
  159         vm_offset_t tmpaddr;
  160         vm_size_t size, pageoff;
  161         vm_prot_t prot, maxprot;
  162         void *handle;
  163         int flags, error;
  164         off_t pos;
  165         vm_object_t obj;
  166 
  167         KKASSERT(p);
  168 
  169         addr = (vm_offset_t) uaddr;
  170         size = ulen;
  171         prot = uprot & VM_PROT_ALL;
  172         flags = uflags;
  173         pos = upos;
  174 
  175         /*
  176          * Make sure mapping fits into numeric range etc.
  177          *
  178          * NOTE: We support the full unsigned range for size now.
  179          */
  180         if (((flags & MAP_ANON) && (fd != -1 || pos != 0)))
  181                 return (EINVAL);
  182 
  183         if (size == 0)
  184                 return (EINVAL);
  185 
  186         if (flags & MAP_STACK) {
  187                 if ((fd != -1) ||
  188                     ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
  189                         return (EINVAL);
  190                 flags |= MAP_ANON;
  191                 pos = 0;
  192         }
  193 
  194         /*
  195          * Virtual page tables cannot be used with MAP_STACK.  Apart from
  196          * it not making any sense, the aux union is used by both
  197          * types.
  198          *
  199          * Because the virtual page table is stored in the backing object
  200          * and might be updated by the kernel, the mapping must be R+W.
  201          */
  202         if (flags & MAP_VPAGETABLE) {
  203                 if (vkernel_enable == 0)
  204                         return (EOPNOTSUPP);
  205                 if (flags & MAP_STACK)
  206                         return (EINVAL);
  207                 if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
  208                         return (EINVAL);
  209         }
  210 
  211         /*
  212          * Align the file position to a page boundary,
  213          * and save its page offset component.
  214          */
  215         pageoff = (pos & PAGE_MASK);
  216         pos -= pageoff;
  217 
  218         /* Adjust size for rounding (on both ends). */
  219         size += pageoff;                        /* low end... */
  220         size = (vm_size_t) round_page(size);    /* hi end */
  221         if (size < ulen)                        /* wrap */
  222                 return(EINVAL);
  223 
  224         /*
  225          * Check for illegal addresses.  Watch out for address wrap... Note
  226          * that VM_*_ADDRESS are not constants due to casts (argh).
  227          */
  228         if (flags & (MAP_FIXED | MAP_TRYFIXED)) {
  229                 /*
  230                  * The specified address must have the same remainder
  231                  * as the file offset taken modulo PAGE_SIZE, so it
  232                  * should be aligned after adjustment by pageoff.
  233                  */
  234                 addr -= pageoff;
  235                 if (addr & PAGE_MASK)
  236                         return (EINVAL);
  237 
  238                 /*
  239                  * Address range must be all in user VM space and not wrap.
  240                  */
  241                 tmpaddr = addr + size;
  242                 if (tmpaddr < addr)
  243                         return (EINVAL);
  244                 if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
  245                         return (EINVAL);
  246                 if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
  247                         return (EINVAL);
  248         } else {
  249                 /*
  250                  * Get a hint of where to map. It also provides mmap offset
  251                  * randomization if enabled.
  252                  */
  253                 addr = vm_map_hint(p, addr, prot);
  254         }
  255 
  256         if (flags & MAP_ANON) {
  257                 /*
  258                  * Mapping blank space is trivial.
  259                  */
  260                 handle = NULL;
  261                 maxprot = VM_PROT_ALL;
  262         } else {
  263                 /*
  264                  * Mapping file, get fp for validation. Obtain vnode and make
  265                  * sure it is of appropriate type.
  266                  */
  267                 fp = holdfp(p->p_fd, fd, -1);
  268                 if (fp == NULL)
  269                         return (EBADF);
  270                 if (fp->f_type != DTYPE_VNODE) {
  271                         error = EINVAL;
  272                         goto done;
  273                 }
  274                 /*
  275                  * POSIX shared-memory objects are defined to have
  276                  * kernel persistence, and are not defined to support
  277                  * read(2)/write(2) -- or even open(2).  Thus, we can
   278  * use MAP_NOSYNC to trade on-disk coherence for speed.
  279                  * The shm_open(3) library routine turns on the FPOSIXSHM
  280                  * flag to request this behavior.
  281                  */
  282                 if (fp->f_flag & FPOSIXSHM)
  283                         flags |= MAP_NOSYNC;
  284                 vp = (struct vnode *) fp->f_data;
  285 
  286                 /*
  287                  * Validate the vnode for the operation.
  288                  */
  289                 switch(vp->v_type) {
  290                 case VREG:
  291                         /*
  292                          * Get the proper underlying object
  293                          */
  294                         if ((obj = vp->v_object) == NULL) {
  295                                 error = EINVAL;
  296                                 goto done;
  297                         }
  298                         KKASSERT((struct vnode *)obj->handle == vp);
  299                         break;
  300                 case VCHR:
  301                         /*
  302                          * Make sure a device has not been revoked.  
  303                          * Mappability is handled by the device layer.
  304                          */
  305                         if (vp->v_rdev == NULL) {
  306                                 error = EBADF;
  307                                 goto done;
  308                         }
  309                         break;
  310                 default:
  311                         /*
  312                          * Nothing else is mappable.
  313                          */
  314                         error = EINVAL;
  315                         goto done;
  316                 }
  317 
  318                 /*
  319                  * XXX hack to handle use of /dev/zero to map anon memory (ala
  320                  * SunOS).
  321                  */
  322                 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
  323                         handle = NULL;
  324                         maxprot = VM_PROT_ALL;
  325                         flags |= MAP_ANON;
  326                         pos = 0;
  327                 } else {
  328                         /*
   329  * cdevs do not provide private mappings of any kind.
  330                          */
  331                         if (vp->v_type == VCHR &&
  332                             (flags & (MAP_PRIVATE|MAP_COPY))) {
  333                                 error = EINVAL;
  334                                 goto done;
  335                         }
  336                         /*
  337                          * Ensure that file and memory protections are
  338                          * compatible.  Note that we only worry about
  339                          * writability if mapping is shared; in this case,
  340                          * current and max prot are dictated by the open file.
  341                          * XXX use the vnode instead?  Problem is: what
  342                          * credentials do we use for determination? What if
  343                          * proc does a setuid?
  344                          */
  345                         maxprot = VM_PROT_EXECUTE;      /* ??? */
  346                         if (fp->f_flag & FREAD) {
  347                                 maxprot |= VM_PROT_READ;
  348                         } else if (prot & PROT_READ) {
  349                                 error = EACCES;
  350                                 goto done;
  351                         }
  352                         /*
  353                          * If we are sharing potential changes (either via
  354                          * MAP_SHARED or via the implicit sharing of character
  355                          * device mappings), and we are trying to get write
  356                          * permission although we opened it without asking
  357                          * for it, bail out.  Check for superuser, only if
  358                          * we're at securelevel < 1, to allow the XIG X server
  359                          * to continue to work.
  360                          */
  361                         if ((flags & MAP_SHARED) != 0 || vp->v_type == VCHR) {
  362                                 if ((fp->f_flag & FWRITE) != 0) {
  363                                         struct vattr va;
  364                                         if ((error = VOP_GETATTR(vp, &va))) {
  365                                                 goto done;
  366                                         }
  367                                         if ((va.va_flags &
  368                                             (IMMUTABLE|APPEND)) == 0) {
  369                                                 maxprot |= VM_PROT_WRITE;
  370                                         } else if (prot & PROT_WRITE) {
  371                                                 error = EPERM;
  372                                                 goto done;
  373                                         }
  374                                 } else if ((prot & PROT_WRITE) != 0) {
  375                                         error = EACCES;
  376                                         goto done;
  377                                 }
  378                         } else {
  379                                 maxprot |= VM_PROT_WRITE;
  380                         }
  381                         handle = (void *)vp;
  382                 }
  383         }
  384 
  385         lwkt_gettoken(&vms->vm_map.token);
  386 
  387         /*
   388  * Do not allow more than a certain number of vm_map_entry structures
  389          * per process.  Scale with the number of rforks sharing the map
  390          * to make the limit reasonable for threads.
  391          */
  392         if (max_proc_mmap && 
  393             vms->vm_map.nentries >= max_proc_mmap * vms->vm_sysref.refcnt) {
  394                 error = ENOMEM;
  395                 lwkt_reltoken(&vms->vm_map.token);
  396                 goto done;
  397         }
  398 
  399         error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
  400                         flags, handle, pos);
  401         if (error == 0)
  402                 *res = (void *)(addr + pageoff);
  403 
  404         lwkt_reltoken(&vms->vm_map.token);
  405 done:
  406         if (fp)
  407                 fdrop(fp);
  408 
  409         return (error);
  410 }
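
To illustrate the page-offset semantics described above kern_mmap(), here is
a minimal userland sketch.  It assumes a kernel with exactly these semantics
(several other systems reject non-page-aligned offsets outright), and the
file argument is arbitrary:

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
        char *p;
        int fd;

        if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
                perror("open");
                exit(1);
        }

        /*
         * Request 4096 bytes starting at file offset 100.  Per the
         * comment above, the kernel actually maps from trunc_page(100)
         * == 0 and returns a pointer adjusted up by the 100-byte page
         * offset, so *p is byte 100 of the file.
         */
        p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 100);
        if (p == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        printf("byte 100 of %s is 0x%02x\n", argv[1], (unsigned char)*p);
        munmap(p, 4096);
        close(fd);
        return (0);
}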
  411 
  412 /*
  413  * mmap system call handler
  414  *
  415  * No requirements.
  416  */
  417 int
  418 sys_mmap(struct mmap_args *uap)
  419 {
  420         int error;
  421 
  422         error = kern_mmap(curproc->p_vmspace, uap->addr, uap->len,
  423                           uap->prot, uap->flags,
  424                           uap->fd, uap->pos, &uap->sysmsg_resultp);
  425 
  426         return (error);
  427 }
  428 
  429 /*
  430  * msync system call handler 
  431  *
  432  * msync_args(void *addr, size_t len, int flags)
  433  *
  434  * No requirements
  435  */
  436 int
  437 sys_msync(struct msync_args *uap)
  438 {
  439         struct proc *p = curproc;
  440         vm_offset_t addr;
  441         vm_offset_t tmpaddr;
  442         vm_size_t size, pageoff;
  443         int flags;
  444         vm_map_t map;
  445         int rv;
  446 
  447         addr = (vm_offset_t) uap->addr;
  448         size = uap->len;
  449         flags = uap->flags;
  450 
  451         pageoff = (addr & PAGE_MASK);
  452         addr -= pageoff;
  453         size += pageoff;
  454         size = (vm_size_t) round_page(size);
  455         if (size < uap->len)            /* wrap */
  456                 return(EINVAL);
  457         tmpaddr = addr + size;          /* workaround gcc4 opt */
  458         if (tmpaddr < addr)             /* wrap */
  459                 return(EINVAL);
  460 
  461         if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
  462                 return (EINVAL);
  463 
  464         map = &p->p_vmspace->vm_map;
  465 
  466         /*
  467          * map->token serializes extracting the address range for size == 0
  468          * msyncs with the vm_map_clean call; if the token were not held
  469          * across the two calls, an intervening munmap/mmap pair, for example,
  470          * could cause msync to occur on a wrong region.
  471          */
  472         lwkt_gettoken(&map->token);
  473 
  474         /*
  475          * XXX Gak!  If size is zero we are supposed to sync "all modified
  476          * pages with the region containing addr".  Unfortunately, we don't
  477          * really keep track of individual mmaps so we approximate by flushing
  478          * the range of the map entry containing addr. This can be incorrect
  479          * if the region splits or is coalesced with a neighbor.
  480          */
  481         if (size == 0) {
  482                 vm_map_entry_t entry;
  483 
  484                 vm_map_lock_read(map);
  485                 rv = vm_map_lookup_entry(map, addr, &entry);
  486                 if (rv == FALSE) {
  487                         vm_map_unlock_read(map);
  488                         rv = KERN_INVALID_ADDRESS;
  489                         goto done;
  490                 }
  491                 addr = entry->start;
  492                 size = entry->end - entry->start;
  493                 vm_map_unlock_read(map);
  494         }
  495 
  496         /*
  497          * Clean the pages and interpret the return value.
  498          */
  499         rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
  500                           (flags & MS_INVALIDATE) != 0);
  501 done:
  502         lwkt_reltoken(&map->token);
  503 
  504         switch (rv) {
  505         case KERN_SUCCESS:
  506                 break;
  507         case KERN_INVALID_ADDRESS:
  508                 return (EINVAL);        /* Sun returns ENOMEM? */
  509         case KERN_FAILURE:
  510                 return (EIO);
  511         default:
  512                 return (EINVAL);
  513         }
  514 
  515         return (0);
  516 }
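
A minimal userland sketch of the flag rules enforced above: MS_ASYNC and
MS_INVALIDATE are mutually exclusive, while MS_SYNC flushes dirty pages
synchronously.  The file name is arbitrary:

#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        int fd = open("data.tmp", O_RDWR | O_CREAT | O_TRUNC, 0644);
        char *p;

        if (fd < 0 || ftruncate(fd, 4096) < 0) {
                perror("setup");
                exit(1);
        }
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        memcpy(p, "hello", 5);

        /* Expected to fail with EINVAL: the flags are mutually exclusive. */
        if (msync(p, 4096, MS_ASYNC | MS_INVALIDATE) < 0)
                perror("msync(MS_ASYNC|MS_INVALIDATE)");

        /* Synchronously flush the dirty page back to data.tmp. */
        if (msync(p, 4096, MS_SYNC) < 0)
                perror("msync(MS_SYNC)");

        munmap(p, 4096);
        close(fd);
        unlink("data.tmp");
        return (0);
}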
  517 
  518 /*
  519  * munmap system call handler
  520  *
  521  * munmap_args(void *addr, size_t len)
  522  *
  523  * No requirements
  524  */
  525 int
  526 sys_munmap(struct munmap_args *uap)
  527 {
  528         struct proc *p = curproc;
  529         vm_offset_t addr;
  530         vm_offset_t tmpaddr;
  531         vm_size_t size, pageoff;
  532         vm_map_t map;
  533 
  534         addr = (vm_offset_t) uap->addr;
  535         size = uap->len;
  536 
  537         pageoff = (addr & PAGE_MASK);
  538         addr -= pageoff;
  539         size += pageoff;
  540         size = (vm_size_t) round_page(size);
  541         if (size < uap->len)            /* wrap */
  542                 return(EINVAL);
  543         tmpaddr = addr + size;          /* workaround gcc4 opt */
  544         if (tmpaddr < addr)             /* wrap */
  545                 return(EINVAL);
  546 
  547         if (size == 0)
  548                 return (0);
  549 
  550         /*
  551          * Check for illegal addresses.  Watch out for address wrap... Note
  552          * that VM_*_ADDRESS are not constants due to casts (argh).
  553          */
  554         if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
  555                 return (EINVAL);
  556         if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
  557                 return (EINVAL);
  558 
  559         map = &p->p_vmspace->vm_map;
  560 
  561         /* map->token serializes between the map check and the actual unmap */
  562         lwkt_gettoken(&map->token);
  563 
  564         /*
  565          * Make sure entire range is allocated.
  566          */
  567         if (!vm_map_check_protection(map, addr, addr + size,
  568                                      VM_PROT_NONE, FALSE)) {
  569                 lwkt_reltoken(&map->token);
  570                 return (EINVAL);
  571         }
  572         /* returns nothing but KERN_SUCCESS anyway */
  573         vm_map_remove(map, addr, addr + size);
  574         lwkt_reltoken(&map->token);
  575         return (0);
  576 }
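
The sketch below exercises the behavior implemented above from userland:
unmapping the middle page of a three-page mapping splits the map entry and
leaves the neighbors intact, whereas (in this kernel) unmapping a range that
is not entirely allocated fails with EINVAL:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        size_t pg = 4096;
        char *p = mmap(NULL, 3 * pg, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }

        /* Punch out the middle page; the kernel splits the entry. */
        if (munmap(p + pg, pg) < 0) {
                perror("munmap");
                exit(1);
        }
        p[0] = 1;               /* first page still mapped */
        p[2 * pg] = 1;          /* last page still mapped */
        return (0);
}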
  577 
  578 /*
  579  * mprotect_args(const void *addr, size_t len, int prot)
  580  *
  581  * No requirements.
  582  */
  583 int
  584 sys_mprotect(struct mprotect_args *uap)
  585 {
  586         struct proc *p = curproc;
  587         vm_offset_t addr;
  588         vm_offset_t tmpaddr;
  589         vm_size_t size, pageoff;
  590         vm_prot_t prot;
  591         int error;
  592 
  593         addr = (vm_offset_t) uap->addr;
  594         size = uap->len;
  595         prot = uap->prot & VM_PROT_ALL;
  596 #if defined(VM_PROT_READ_IS_EXEC)
  597         if (prot & VM_PROT_READ)
  598                 prot |= VM_PROT_EXECUTE;
  599 #endif
  600 
  601         pageoff = (addr & PAGE_MASK);
  602         addr -= pageoff;
  603         size += pageoff;
  604         size = (vm_size_t) round_page(size);
  605         if (size < uap->len)            /* wrap */
  606                 return(EINVAL);
  607         tmpaddr = addr + size;          /* workaround gcc4 opt */
  608         if (tmpaddr < addr)             /* wrap */
  609                 return(EINVAL);
  610 
  611         switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size,
  612                                prot, FALSE)) {
  613         case KERN_SUCCESS:
  614                 error = 0;
  615                 break;
  616         case KERN_PROTECTION_FAILURE:
  617                 error = EACCES;
  618                 break;
  619         default:
  620                 error = EINVAL;
  621                 break;
  622         }
  623         return (error);
  624 }
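
A minimal userland usage sketch.  The request is rounded to page boundaries
exactly as above, and after the call a write through p would fault with
SIGSEGV:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        strcpy(p, "read-only soon");

        /* Drop write permission on the page. */
        if (mprotect(p, 4096, PROT_READ) < 0) {
                perror("mprotect");
                exit(1);
        }
        printf("still readable: %s\n", p);
        munmap(p, 4096);
        return (0);
}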
  625 
  626 /*
  627  * minherit system call handler
  628  *
  629  * minherit_args(void *addr, size_t len, int inherit)
  630  *
  631  * No requirements.
  632  */
  633 int
  634 sys_minherit(struct minherit_args *uap)
  635 {
  636         struct proc *p = curproc;
  637         vm_offset_t addr;
  638         vm_offset_t tmpaddr;
  639         vm_size_t size, pageoff;
  640         vm_inherit_t inherit;
  641         int error;
  642 
  643         addr = (vm_offset_t)uap->addr;
  644         size = uap->len;
  645         inherit = uap->inherit;
  646 
  647         pageoff = (addr & PAGE_MASK);
  648         addr -= pageoff;
  649         size += pageoff;
  650         size = (vm_size_t) round_page(size);
  651         if (size < uap->len)            /* wrap */
  652                 return(EINVAL);
  653         tmpaddr = addr + size;          /* workaround gcc4 opt */
  654         if (tmpaddr < addr)             /* wrap */
  655                 return(EINVAL);
  656 
  657         switch (vm_map_inherit(&p->p_vmspace->vm_map, addr,
  658                                addr + size, inherit)) {
  659         case KERN_SUCCESS:
  660                 error = 0;
  661                 break;
  662         case KERN_PROTECTION_FAILURE:
  663                 error = EACCES;
  664                 break;
  665         default:
  666                 error = EINVAL;
  667                 break;
  668         }
  669         return (error);
  670 }
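
A userland sketch of minherit(), assuming the BSD INHERIT_NONE constant from
<sys/mman.h>: a child forked after the call no longer has the region mapped,
so the child's access below faults:

#include <sys/mman.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        p[0] = 42;

        /* Children forked from now on will not inherit this page. */
        if (minherit(p, 4096, INHERIT_NONE) < 0) {
                perror("minherit");
                exit(1);
        }
        if (fork() == 0) {
                printf("child read: %d\n", p[0]);       /* faults: unmapped */
                _exit(0);
        }
        wait(NULL);
        return (0);
}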
  671 
  672 /*
  673  * madvise system call handler
  674  * 
  675  * madvise_args(void *addr, size_t len, int behav)
  676  *
  677  * No requirements.
  678  */
  679 int
  680 sys_madvise(struct madvise_args *uap)
  681 {
  682         struct proc *p = curproc;
  683         vm_offset_t start, end;
  684         vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
  685         int error;
  686 
  687         /*
  688          * Check for illegal behavior
  689          */
  690         if (uap->behav < 0 || uap->behav >= MADV_CONTROL_END)
  691                 return (EINVAL);
  692         /*
  693          * Check for illegal addresses.  Watch out for address wrap... Note
  694          * that VM_*_ADDRESS are not constants due to casts (argh).
  695          */
  696         if (tmpaddr < (vm_offset_t)uap->addr)
  697                 return (EINVAL);
  698         if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
  699                 return (EINVAL);
  700         if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
  701                 return (EINVAL);
  702 
  703         /*
  704          * Since this routine is only advisory, we default to conservative
  705          * behavior.
  706          */
  707         start = trunc_page((vm_offset_t)uap->addr);
  708         end = round_page(tmpaddr);
  709 
  710         error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
  711                                uap->behav, 0);
  712         return (error);
  713 }
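
Since the call is purely advisory, userland usage is straightforward.  In
this sketch MADV_FREE tells the VM system the contents may be discarded;
whether the pages are actually freed is up to the kernel:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
        size_t len = 16 * 4096;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        memset(p, 0xa5, len);

        /* Advise that the contents are no longer needed. */
        if (madvise(p, len, MADV_FREE) < 0)
                perror("madvise");

        munmap(p, len);
        return (0);
}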
  714 
  715 /*
  716  * mcontrol system call handler
  717  *
  718  * mcontrol_args(void *addr, size_t len, int behav, off_t value)
  719  *
  720  * No requirements
  721  */
  722 int
  723 sys_mcontrol(struct mcontrol_args *uap)
  724 {
  725         struct proc *p = curproc;
  726         vm_offset_t start, end;
  727         vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
  728         int error;
  729 
  730         /*
  731          * Check for illegal behavior
  732          */
  733         if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
  734                 return (EINVAL);
  735         /*
  736          * Check for illegal addresses.  Watch out for address wrap... Note
  737          * that VM_*_ADDRESS are not constants due to casts (argh).
  738          */
  739         if (tmpaddr < (vm_offset_t) uap->addr)
  740                 return (EINVAL);
  741         if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
  742                 return (EINVAL);
  743         if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
  744                 return (EINVAL);
  745 
  746         /*
  747          * Since this routine is only advisory, we default to conservative
  748          * behavior.
  749          */
  750         start = trunc_page((vm_offset_t)uap->addr);
  751         end = round_page(tmpaddr);
  752         
  753         error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
  754                                uap->behav, uap->value);
  755         return (error);
  756 }
  757 
  758 
  759 /*
  760  * mincore system call handler
  761  *
  762  * mincore_args(const void *addr, size_t len, char *vec)
  763  *
  764  * No requirements
  765  */
  766 int
  767 sys_mincore(struct mincore_args *uap)
  768 {
  769         struct proc *p = curproc;
  770         vm_offset_t addr, first_addr;
  771         vm_offset_t end, cend;
  772         pmap_t pmap;
  773         vm_map_t map;
  774         char *vec;
  775         int error;
  776         int vecindex, lastvecindex;
  777         vm_map_entry_t current;
  778         vm_map_entry_t entry;
  779         int mincoreinfo;
  780         unsigned int timestamp;
  781 
  782         /*
  783          * Make sure that the addresses presented are valid for user
  784          * mode.
  785          */
  786         first_addr = addr = trunc_page((vm_offset_t) uap->addr);
  787         end = addr + (vm_size_t)round_page(uap->len);
  788         if (end < addr)
  789                 return (EINVAL);
  790         if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS)
  791                 return (EINVAL);
  792 
  793         /*
  794          * Address of byte vector
  795          */
  796         vec = uap->vec;
  797 
  798         map = &p->p_vmspace->vm_map;
  799         pmap = vmspace_pmap(p->p_vmspace);
  800 
  801         lwkt_gettoken(&map->token);
  802         vm_map_lock_read(map);
  803 RestartScan:
  804         timestamp = map->timestamp;
  805 
  806         if (!vm_map_lookup_entry(map, addr, &entry))
  807                 entry = entry->next;
  808 
  809         /*
  810          * Do this on a map entry basis so that if the pages are not
   811  * in the current process's address space, we can easily look
  812          * up the pages elsewhere.
  813          */
  814         lastvecindex = -1;
  815         for(current = entry;
  816                 (current != &map->header) && (current->start < end);
  817                 current = current->next) {
  818 
  819                 /*
  820                  * ignore submaps (for now) or null objects
  821                  */
  822                 if (current->maptype != VM_MAPTYPE_NORMAL &&
  823                     current->maptype != VM_MAPTYPE_VPAGETABLE) {
  824                         continue;
  825                 }
  826                 if (current->object.vm_object == NULL)
  827                         continue;
  828                 
  829                 /*
  830                  * limit this scan to the current map entry and the
  831                  * limits for the mincore call
  832                  */
  833                 if (addr < current->start)
  834                         addr = current->start;
  835                 cend = current->end;
  836                 if (cend > end)
  837                         cend = end;
  838 
  839                 /*
  840                  * scan this entry one page at a time
  841                  */
  842                 while (addr < cend) {
  843                         /*
  844                          * Check pmap first, it is likely faster, also
  845                          * it can provide info as to whether we are the
  846                          * one referencing or modifying the page.
  847                          *
  848                          * If we have to check the VM object, only mess
  849                          * around with normal maps.  Do not mess around
  850                          * with virtual page tables (XXX).
  851                          */
  852                         mincoreinfo = pmap_mincore(pmap, addr);
  853                         if (mincoreinfo == 0 &&
  854                             current->maptype == VM_MAPTYPE_NORMAL) {
  855                                 vm_pindex_t pindex;
  856                                 vm_ooffset_t offset;
  857                                 vm_page_t m;
  858 
  859                                 /*
  860                                  * calculate the page index into the object
  861                                  */
  862                                 offset = current->offset + (addr - current->start);
  863                                 pindex = OFF_TO_IDX(offset);
  864 
  865                                 /*
  866                                  * if the page is resident, then gather 
  867                                  * information about it.  spl protection is
  868                                  * required to maintain the object 
  869                                  * association.  And XXX what if the page is
  870                                  * busy?  What's the deal with that?
  871                                  *
  872                                  * XXX vm_token - legacy for pmap_ts_referenced
  873                                  *     in i386 and vkernel pmap code.
  874                                  */
  875                                 lwkt_gettoken(&vm_token);
  876                                 vm_object_hold(current->object.vm_object);
  877                                 m = vm_page_lookup(current->object.vm_object,
  878                                                     pindex);
  879                                 if (m && m->valid) {
  880                                         mincoreinfo = MINCORE_INCORE;
  881                                         if (m->dirty ||
  882                                                 pmap_is_modified(m))
  883                                                 mincoreinfo |= MINCORE_MODIFIED_OTHER;
  884                                         if ((m->flags & PG_REFERENCED) ||
  885                                                 pmap_ts_referenced(m)) {
  886                                                 vm_page_flag_set(m, PG_REFERENCED);
  887                                                 mincoreinfo |= MINCORE_REFERENCED_OTHER;
  888                                         }
  889                                 }
  890                                 vm_object_drop(current->object.vm_object);
  891                                 lwkt_reltoken(&vm_token);
  892                         }
  893 
  894                         /*
  895                          * subyte may page fault.  In case it needs to modify
  896                          * the map, we release the lock.
  897                          */
  898                         vm_map_unlock_read(map);
  899 
  900                         /*
  901                          * calculate index into user supplied byte vector
  902                          */
  903                         vecindex = OFF_TO_IDX(addr - first_addr);
  904 
  905                         /*
  906                          * If we have skipped map entries, we need to make sure that
  907                          * the byte vector is zeroed for those skipped entries.
  908                          */
   909                         while((lastvecindex + 1) < vecindex) {
   910                                 ++lastvecindex;
   911                                 error = subyte(vec + lastvecindex, 0);
   912                                 if (error) {
   913                                         error = EFAULT;
   914                                         goto done;
   915                                 }
   916                         }
  917 
  918                         /*
  919                          * Pass the page information to the user
  920                          */
   921                         error = subyte(vec + vecindex, mincoreinfo);
  922                         if (error) {
  923                                 error = EFAULT;
  924                                 goto done;
  925                         }
  926 
  927                         /*
  928                          * If the map has changed, due to the subyte, the previous
  929                          * output may be invalid.
  930                          */
  931                         vm_map_lock_read(map);
  932                         if (timestamp != map->timestamp)
  933                                 goto RestartScan;
  934 
  935                         lastvecindex = vecindex;
  936                         addr += PAGE_SIZE;
  937                 }
  938         }
  939 
  940         /*
  941          * subyte may page fault.  In case it needs to modify
  942          * the map, we release the lock.
  943          */
  944         vm_map_unlock_read(map);
  945 
  946         /*
  947          * Zero the last entries in the byte vector.
  948          */
  949         vecindex = OFF_TO_IDX(end - first_addr);
   950         while((lastvecindex + 1) < vecindex) {
   951                 ++lastvecindex;
   952                 error = subyte(vec + lastvecindex, 0);
   953                 if (error) {
   954                         error = EFAULT;
   955                         goto done;
   956                 }
   957         }
  958         
  959         /*
  960          * If the map has changed, due to the subyte, the previous
  961          * output may be invalid.
  962          */
  963         vm_map_lock_read(map);
  964         if (timestamp != map->timestamp)
  965                 goto RestartScan;
  966         vm_map_unlock_read(map);
  967 
  968         error = 0;
  969 done:
  970         lwkt_reltoken(&map->token);
  971         return (error);
  972 }
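
A userland sketch of the residency query: fault in two of four anonymous
pages, then read the byte vector back.  MINCORE_INCORE is the BSD flag name
from <sys/mman.h>:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
        long pg = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        char vec[4];
        int i;

        if (p == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        p[0] = 1;               /* fault in page 0 */
        p[2 * pg] = 1;          /* and page 2 */

        if (mincore(p, 4 * pg, vec) < 0) {
                perror("mincore");
                exit(1);
        }
        for (i = 0; i < 4; i++)
                printf("page %d: %sresident\n", i,
                    (vec[i] & MINCORE_INCORE) ? "" : "not ");
        munmap(p, 4 * pg);
        return (0);
}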
  973 
  974 /*
  975  * mlock system call handler
  976  *
  977  * mlock_args(const void *addr, size_t len)
  978  *
  979  * No requirements
  980  */
  981 int
  982 sys_mlock(struct mlock_args *uap)
  983 {
  984         vm_offset_t addr;
  985         vm_offset_t tmpaddr;
  986         vm_size_t size, pageoff;
  987         struct thread *td = curthread;
  988         struct proc *p = td->td_proc;
  989         int error;
  990 
  991         addr = (vm_offset_t) uap->addr;
  992         size = uap->len;
  993 
  994         pageoff = (addr & PAGE_MASK);
  995         addr -= pageoff;
  996         size += pageoff;
  997         size = (vm_size_t) round_page(size);
  998         if (size < uap->len)            /* wrap */
  999                 return(EINVAL);
 1000         tmpaddr = addr + size;          /* workaround gcc4 opt */
 1001         if (tmpaddr < addr)             /* wrap */
 1002                 return (EINVAL);
 1003 
 1004         if (atop(size) + vmstats.v_wire_count > vm_page_max_wired)
 1005                 return (EAGAIN);
 1006 
 1007         /* 
 1008          * We do not need to synchronize against other threads updating ucred;
  1009          * they update p->ucred, which we synchronize into td_ucred ourselves.
 1010          */
 1011 #ifdef pmap_wired_count
 1012         if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
 1013             p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) {
 1014                 return (ENOMEM);
 1015         }
 1016 #else
 1017         error = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
 1018         if (error) {
 1019                 return (error);
 1020         }
 1021 #endif
 1022         error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
 1023         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1024 }
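
A typical userland use of mlock() is keeping key material off the swap
device.  Per the checks above, expect EAGAIN when the system-wide wired-page
limit would be exceeded and ENOMEM when the per-process RLIMIT_MEMLOCK limit
would be:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
        size_t len = 4096;
        char *key = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (key == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }

        /* Wire the page so it can never be paged out. */
        if (mlock(key, len) < 0) {
                perror("mlock");
                exit(1);
        }
        strcpy(key, "secret");
        /* ... use the key, then scrub and release it ... */
        memset(key, 0, len);
        munlock(key, len);
        munmap(key, len);
        return (0);
}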
 1025 
 1026 /*
 1027  * mlockall(int how)
 1028  *
 1029  * No requirements
 1030  */
 1031 int
 1032 sys_mlockall(struct mlockall_args *uap)
 1033 {
 1034 #ifdef _P1003_1B_VISIBLE
 1035         struct thread *td = curthread;
 1036         struct proc *p = td->td_proc;
 1037         vm_map_t map = &p->p_vmspace->vm_map;
 1038         vm_map_entry_t entry;
 1039         int how = uap->how;
 1040         int rc = KERN_SUCCESS;
 1041 
 1042         if (((how & MCL_CURRENT) == 0) && ((how & MCL_FUTURE) == 0))
 1043                 return (EINVAL);
 1044 
 1045         rc = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
 1046         if (rc) 
 1047                 return (rc);
 1048 
 1049         vm_map_lock(map);
 1050         do {
 1051                 if (how & MCL_CURRENT) {
 1052                         for(entry = map->header.next;
 1053                             entry != &map->header;
 1054                             entry = entry->next);
 1055 
 1056                         rc = ENOSYS;
 1057                         break;
 1058                 }
 1059         
 1060                 if (how & MCL_FUTURE)
 1061                         map->flags |= MAP_WIREFUTURE;
 1062         } while(0);
 1063         vm_map_unlock(map);
 1064 
 1065         return (rc);
 1066 #else /* !_P1003_1B_VISIBLE */
 1067         return (ENOSYS);
 1068 #endif /* _P1003_1B_VISIBLE */
 1069 }
 1070 
 1071 /*
 1072  * munlockall(void)
 1073  *
 1074  *      Unwire all user-wired map entries, cancel MCL_FUTURE.
 1075  *
 1076  * No requirements
 1077  */
 1078 int
 1079 sys_munlockall(struct munlockall_args *uap)
 1080 {
 1081         struct thread *td = curthread;
 1082         struct proc *p = td->td_proc;
 1083         vm_map_t map = &p->p_vmspace->vm_map;
 1084         vm_map_entry_t entry;
 1085         int rc = KERN_SUCCESS;
 1086 
 1087         vm_map_lock(map);
 1088 
 1089         /* Clear MAP_WIREFUTURE to cancel mlockall(MCL_FUTURE) */
 1090         map->flags &= ~MAP_WIREFUTURE;
 1091 
 1092 retry:
 1093         for (entry = map->header.next;
 1094              entry != &map->header;
 1095              entry = entry->next) {
 1096                 if ((entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
 1097                         continue;
 1098 
 1099                 /*
 1100                  * If we encounter an in-transition entry, we release the 
 1101                  * map lock and retry the scan; we do not decrement any
 1102                  * wired_count more than once because we do not touch
 1103                  * any entries with MAP_ENTRY_USER_WIRED not set.
 1104                  *
 1105                  * There is a potential interleaving with concurrent
 1106                  * mlockall()s here -- if we abort a scan, an mlockall()
 1107                  * could start, wire a number of entries before our 
 1108                  * current position in, and then stall itself on this
 1109                  * or any other in-transition entry. If that occurs, when
 1110                  * we resume, we will unwire those entries. 
 1111                  */
 1112                 if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 1113                         entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 1114                         ++mycpu->gd_cnt.v_intrans_coll;
 1115                         ++mycpu->gd_cnt.v_intrans_wait;
 1116                         vm_map_transition_wait(map);
 1117                         goto retry;
 1118                 }
 1119 
 1120                 KASSERT(entry->wired_count > 0, 
 1121                         ("wired_count was 0 with USER_WIRED set! %p", entry));
 1122         
 1123                 /* Drop wired count, if it hits zero, unwire the entry */
 1124                 entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 1125                 entry->wired_count--;
 1126                 if (entry->wired_count == 0)
 1127                         vm_fault_unwire(map, entry);
 1128         }
 1129 
 1130         map->timestamp++;
 1131         vm_map_unlock(map);
 1132 
 1133         return (rc);
 1134 }
 1135 
 1136 /*
 1137  * munlock system call handler
 1138  *
 1139  * munlock_args(const void *addr, size_t len)
 1140  *
 1141  * No requirements
 1142  */
 1143 int
 1144 sys_munlock(struct munlock_args *uap)
 1145 {
 1146         struct thread *td = curthread;
 1147         struct proc *p = td->td_proc;
 1148         vm_offset_t addr;
 1149         vm_offset_t tmpaddr;
 1150         vm_size_t size, pageoff;
 1151         int error;
 1152 
 1153         addr = (vm_offset_t) uap->addr;
 1154         size = uap->len;
 1155 
 1156         pageoff = (addr & PAGE_MASK);
 1157         addr -= pageoff;
 1158         size += pageoff;
 1159         size = (vm_size_t) round_page(size);
 1160 
 1161         tmpaddr = addr + size;
 1162         if (tmpaddr < addr)             /* wrap */
 1163                 return (EINVAL);
 1164 
 1165 #ifndef pmap_wired_count
 1166         error = priv_check(td, PRIV_ROOT);
 1167         if (error)
 1168                 return (error);
 1169 #endif
 1170 
 1171         error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
 1172         return (error == KERN_SUCCESS ? 0 : ENOMEM);
 1173 }
 1174 
 1175 /*
 1176  * Internal version of mmap.
 1177  * Currently used by mmap, exec, and sys5 shared memory.
 1178  * Handle is either a vnode pointer or NULL for MAP_ANON.
 1179  * 
 1180  * No requirements
 1181  */
 1182 int
 1183 vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 1184         vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
 1185 {
 1186         boolean_t fitit;
 1187         vm_object_t object;
 1188         vm_offset_t eaddr;
 1189         vm_size_t   esize;
 1190         vm_size_t   align;
 1191         struct vnode *vp;
 1192         struct thread *td = curthread;
 1193         struct proc *p;
 1194         int rv = KERN_SUCCESS;
 1195         off_t objsize;
 1196         int docow;
 1197         int error;
 1198 
 1199         if (size == 0)
 1200                 return (0);
 1201 
 1202         objsize = round_page(size);
 1203         if (objsize < size)
 1204                 return (EINVAL);
 1205         size = objsize;
 1206 
 1207         lwkt_gettoken(&map->token);
 1208         
 1209         /*
 1210          * XXX messy code, fixme
 1211          *
 1212          * NOTE: Overflow checks require discrete statements or GCC4
 1213          * will optimize it out.
 1214          */
 1215         if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) {
 1216                 esize = map->size + size;       /* workaround gcc4 opt */
 1217                 if (esize < map->size ||
 1218                     esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
 1219                         lwkt_reltoken(&map->token);
 1220                         return(ENOMEM);
 1221                 }
 1222         }
 1223 
 1224         /*
 1225          * We currently can only deal with page aligned file offsets.
 1226          * The check is here rather than in the syscall because the
 1227          * kernel calls this function internally for other mmaping
 1228          * operations (such as in exec) and non-aligned offsets will
 1229          * cause pmap inconsistencies...so we want to be sure to
 1230          * disallow this in all cases.
 1231          *
 1232          * NOTE: Overflow checks require discrete statements or GCC4
 1233          * will optimize it out.
 1234          */
 1235         if (foff & PAGE_MASK) {
 1236                 lwkt_reltoken(&map->token);
 1237                 return (EINVAL);
 1238         }
 1239 
 1240         /*
 1241          * Handle alignment.  For large memory maps it is possible
 1242          * that the MMU can optimize the page table so align anything
 1243          * that is a multiple of SEG_SIZE to SEG_SIZE.
 1244          *
  1245  * Also align any large mapping (bigger than 16x SEG_SIZE) to a
 1246          * SEG_SIZE address boundary.
 1247          */
 1248         if (flags & MAP_SIZEALIGN) {
 1249                 align = size;
 1250                 if ((align ^ (align - 1)) != (align << 1) - 1) {
 1251                         lwkt_reltoken(&map->token);
 1252                         return (EINVAL);
 1253                 }
 1254         } else if ((flags & MAP_FIXED) == 0 &&
 1255                    ((size & SEG_MASK) == 0 || size > SEG_SIZE * 16)) {
 1256                 align = SEG_SIZE;
 1257         } else {
 1258                 align = PAGE_SIZE;
 1259         }
 1260 
 1261         if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
 1262                 fitit = TRUE;
 1263                 *addr = round_page(*addr);
 1264         } else {
 1265                 if (*addr != trunc_page(*addr)) {
 1266                         lwkt_reltoken(&map->token);
 1267                         return (EINVAL);
 1268                 }
 1269                 eaddr = *addr + size;
 1270                 if (eaddr < *addr) {
 1271                         lwkt_reltoken(&map->token);
 1272                         return (EINVAL);
 1273                 }
 1274                 fitit = FALSE;
 1275                 if ((flags & MAP_TRYFIXED) == 0)
 1276                         vm_map_remove(map, *addr, *addr + size);
 1277         }
 1278 
 1279         /*
 1280          * Lookup/allocate object.
 1281          */
 1282         if (flags & MAP_ANON) {
 1283                 /*
 1284                  * Unnamed anonymous regions always start at 0.
 1285                  */
 1286                 if (handle) {
 1287                         /*
 1288                          * Default memory object
 1289                          */
 1290                         object = default_pager_alloc(handle, objsize,
 1291                                                      prot, foff);
 1292                         if (object == NULL) {
 1293                                 lwkt_reltoken(&map->token);
 1294                                 return(ENOMEM);
 1295                         }
 1296                         docow = MAP_PREFAULT_PARTIAL;
 1297                 } else {
 1298                         /*
 1299                          * Implicit single instance of a default memory
 1300                          * object, so we don't need a VM object yet.
 1301                          */
 1302                         foff = 0;
 1303                         object = NULL;
 1304                         docow = 0;
 1305                 }
 1306                 vp = NULL;
 1307         } else {
 1308                 vp = (struct vnode *)handle;
 1309                 if (vp->v_type == VCHR) {
 1310                         /*
 1311                          * Device mappings (device size unknown?).
 1312                          * Force them to be shared.
 1313                          */
 1314                         error = dev_dmmap_single(vp->v_rdev, &foff, objsize,
 1315                                                 &object, prot);
 1316 
 1317                         if (error == ENODEV) {
 1318                                 handle = (void *)(intptr_t)vp->v_rdev;
 1319                                 object = dev_pager_alloc(handle, objsize, prot, foff);
 1320                                 if (object == NULL) {
 1321                                         lwkt_reltoken(&map->token);
 1322                                         return(EINVAL);
 1323                                 }
 1324                         }
 1325                         docow = MAP_PREFAULT_PARTIAL;
 1326                         flags &= ~(MAP_PRIVATE|MAP_COPY);
 1327                         flags |= MAP_SHARED;
 1328                 } else {
 1329                         /*
 1330                          * Regular file mapping (typically).  The attribute
  1331  * check is for the link count test only.  Mmapable
 1332                          * vnodes must already have a VM object assigned.
 1333                          */
 1334                         struct vattr vat;
 1335                         int error;
 1336 
 1337                         error = VOP_GETATTR(vp, &vat);
 1338                         if (error) {
 1339                                 lwkt_reltoken(&map->token);
 1340                                 return (error);
 1341                         }
 1342                         docow = MAP_PREFAULT_PARTIAL;
 1343                         object = vnode_pager_reference(vp);
 1344                         if (object == NULL && vp->v_type == VREG) {
 1345                                 lwkt_reltoken(&map->token);
 1346                                 kprintf("Warning: cannot mmap vnode %p, no "
 1347                                         "object\n", vp);
 1348                                 return(EINVAL);
 1349                         }
 1350 
 1351                         /*
 1352                          * If it is a regular file without any references
 1353                          * we do not need to sync it.
 1354                          */
 1355                         if (vp->v_type == VREG && vat.va_nlink == 0) {
 1356                                 flags |= MAP_NOSYNC;
 1357                         }
 1358                 }
 1359         }
 1360 
 1361         /*
 1362          * Deal with the adjusted flags
 1363          */
 1364         if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
 1365                 docow |= MAP_COPY_ON_WRITE;
 1366         if (flags & MAP_NOSYNC)
 1367                 docow |= MAP_DISABLE_SYNCER;
 1368         if (flags & MAP_NOCORE)
 1369                 docow |= MAP_DISABLE_COREDUMP;
 1370 
 1371 #if defined(VM_PROT_READ_IS_EXEC)
 1372         if (prot & VM_PROT_READ)
 1373                 prot |= VM_PROT_EXECUTE;
 1374 
 1375         if (maxprot & VM_PROT_READ)
 1376                 maxprot |= VM_PROT_EXECUTE;
 1377 #endif
 1378 
 1379         /*
 1380          * This may place the area in its own page directory if (size) is
 1381          * large enough, otherwise it typically returns its argument.
 1382          */
 1383         if (fitit) {
 1384                 *addr = pmap_addr_hint(object, *addr, size);
 1385         }
 1386 
 1387         /*
 1388          * Stack mappings need special attention.
 1389          *
 1390          * Mappings that use virtual page tables will default to storing
 1391          * the page table at offset 0.
 1392          */
 1393         if (flags & MAP_STACK) {
 1394                 rv = vm_map_stack(map, *addr, size, flags,
 1395                                   prot, maxprot, docow);
 1396         } else if (flags & MAP_VPAGETABLE) {
 1397                 rv = vm_map_find(map, object, foff, addr, size, align,
 1398                                  fitit, VM_MAPTYPE_VPAGETABLE,
 1399                                  prot, maxprot, docow);
 1400         } else {
 1401                 rv = vm_map_find(map, object, foff, addr, size, align,
 1402                                  fitit, VM_MAPTYPE_NORMAL,
 1403                                  prot, maxprot, docow);
 1404         }
 1405 
 1406         if (rv != KERN_SUCCESS) {
 1407                 /*
 1408                  * Lose the object reference. Will destroy the
 1409                  * object if it's an unnamed anonymous mapping
 1410                  * or named anonymous without other references.
 1411                  */
 1412                 vm_object_deallocate(object);
 1413                 goto out;
 1414         }
 1415 
 1416         /*
 1417          * Shared memory is also shared with children.
 1418          */
 1419         if (flags & (MAP_SHARED|MAP_INHERIT)) {
 1420                 rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
 1421                 if (rv != KERN_SUCCESS) {
 1422                         vm_map_remove(map, *addr, *addr + size);
 1423                         goto out;
 1424                 }
 1425         }
 1426 
 1427         /* If a process has marked all future mappings for wiring, do so */
 1428         if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
 1429                 vm_map_unwire(map, *addr, *addr + size, FALSE);
 1430 
 1431         /*
 1432          * Set the access time on the vnode
 1433          */
 1434         if (vp != NULL)
 1435                 vn_mark_atime(vp, td);
 1436 out:
 1437         lwkt_reltoken(&map->token);
 1438         
 1439         switch (rv) {
 1440         case KERN_SUCCESS:
 1441                 return (0);
 1442         case KERN_INVALID_ADDRESS:
 1443         case KERN_NO_SPACE:
 1444                 return (ENOMEM);
 1445         case KERN_PROTECTION_FAILURE:
 1446                 return (EACCES);
 1447         default:
 1448                 return (EINVAL);
 1449         }
 1450 }
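
The MAP_SIZEALIGN path above rejects any size that is not a power of two
using the expression (align ^ (align - 1)) != (align << 1) - 1.  A
standalone sketch of why that works: for a power of two x, x ^ (x - 1) sets
every bit up to and including x's single set bit, which equals (x << 1) - 1;
for any other nonzero x the identity fails (x == 0 never reaches the check,
since a zero size returns early):

#include <stdio.h>

static int
is_power_of_two(unsigned long x)
{
        return ((x ^ (x - 1)) == (x << 1) - 1);
}

int
main(void)
{
        unsigned long tests[] = { 1, 2, 4096, 4097, 6 * 4096, 1UL << 20 };
        int i;

        for (i = 0; i < 6; i++)
                printf("%lu %s a power of two\n", tests[i],
                    is_power_of_two(tests[i]) ? "is" : "is not");
        return (0);
}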
 1451 
 1452 /*
 1453  * Translate a Mach VM return code to zero on success or the appropriate errno
 1454  * on failure.
 1455  */
 1456 int
 1457 vm_mmap_to_errno(int rv)
 1458 {
 1459 
 1460         switch (rv) {
 1461         case KERN_SUCCESS:
 1462                 return (0);
 1463         case KERN_INVALID_ADDRESS:
 1464         case KERN_NO_SPACE:
 1465                 return (ENOMEM);
 1466         case KERN_PROTECTION_FAILURE:
 1467                 return (EACCES);
 1468         default:
 1469                 return (EINVAL);
 1470         }
 1471 }
