FreeBSD/Linux Kernel Cross Reference
sys/vm/vm_mmap.c

/*-
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *      @(#)vm_mmap.c   8.4 (Berkeley) 1/12/94
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include "opt_mac.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mac.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
        int incr;
};
#endif

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */
static void vmmapentry_rsrc_init(void *);
SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(dummy)
        void *dummy;
{
        max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry);
        max_proc_mmap /= 100;
}
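
/*
 * Worked example (the figures are illustrative, not taken from this
 * file): with a 320 MB vm_kmem_size and a vm_map_entry of roughly
 * 100 bytes, the default works out to about
 *
 *      (335544320 / 100) / 100 == 33554 map entries per process.
 *
 * The limit is exported above as a read-write sysctl, so it can be
 * inspected or tuned at run time:
 *
 *      $ sysctl vm.max_proc_mmap
 */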

static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
    int *, struct vnode *, vm_ooffset_t, vm_object_t *);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sbrk(td, uap)
        struct thread *td;
        struct sbrk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
        int incr;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
sstk(td, uap)
        struct thread *td;
        struct sstk_args *uap;
{
        /* Not yet implemented */
        return (EOPNOTSUPP);
}

#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
        int dummy;
};
#endif

/* ARGSUSED */
int
ogetpagesize(td, uap)
        struct thread *td;
        struct getpagesize_args *uap;
{
        /* MP SAFE */
        td->td_retval[0] = PAGE_SIZE;
        return (0);
}
#endif                          /* COMPAT_43 */


/*
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
 */
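
/*
 * Userland sketch of the alignment rule described above (fd, len and
 * off are assumed to be set up by the caller; the idiom, not the
 * names, is the point): align the file offset down to a page boundary
 * and add the remainder to the returned address.
 *
 *      off_t pgoff = off & (getpagesize() - 1);
 *      char *base = mmap(NULL, len + pgoff, PROT_READ, MAP_PRIVATE,
 *          fd, off - pgoff);           aligned file offset
 *      char *data = base + pgoff;      first byte actually wanted
 */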
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
        void *addr;
        size_t len;
        int prot;
        int flags;
        int fd;
        long pad;
        off_t pos;
};
#endif

/*
 * MPSAFE
 */
int
mmap(td, uap)
        struct thread *td;
        struct mmap_args *uap;
{
        struct file *fp;
        struct vnode *vp;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot, maxprot;
        void *handle;
        int flags, error;
        off_t pos;
        struct vmspace *vms = td->td_proc->p_vmspace;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
        flags = uap->flags;
        pos = uap->pos;

        fp = NULL;
        /* Make sure the mapping fits into the numeric range, etc. */
        if ((ssize_t) uap->len < 0 ||
            ((flags & MAP_ANON) && uap->fd != -1))
                return (EINVAL);

        if (flags & MAP_STACK) {
                if ((uap->fd != -1) ||
                    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
                        return (EINVAL);
                flags |= MAP_ANON;
                pos = 0;
        }

        /*
         * Align the file position to a page boundary,
         * and save its page offset component.
         */
        pageoff = (pos & PAGE_MASK);
        pos -= pageoff;

        /* Adjust size for rounding (on both ends). */
        size += pageoff;                        /* low end... */
        size = (vm_size_t) round_page(size);    /* hi end */

        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        if (flags & MAP_FIXED) {
                /*
                 * The specified address must have the same remainder
                 * as the file offset taken modulo PAGE_SIZE, so it
                 * should be aligned after adjustment by pageoff.
                 */
                addr -= pageoff;
                if (addr & PAGE_MASK)
                        return (EINVAL);
                /* Address range must be all in user VM space. */
                if (addr < vm_map_min(&vms->vm_map) ||
                    addr + size > vm_map_max(&vms->vm_map))
                        return (EINVAL);
                if (addr + size < addr)
                        return (EINVAL);
        } else {
                /*
                 * XXX for non-fixed mappings where no hint is provided or
                 * the hint would fall in the potential heap space,
                 * place it after the end of the largest possible heap.
                 *
                 * There should really be a pmap call to determine a reasonable
                 * location.
                 */
                PROC_LOCK(td->td_proc);
                if (addr == 0 ||
                    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
                    addr < round_page((vm_offset_t)vms->vm_daddr +
                    lim_max(td->td_proc, RLIMIT_DATA))))
                        addr = round_page((vm_offset_t)vms->vm_daddr +
                            lim_max(td->td_proc, RLIMIT_DATA));
                PROC_UNLOCK(td->td_proc);
        }
        if (flags & MAP_ANON) {
                /*
                 * Mapping blank space is trivial.
                 */
                handle = NULL;
                maxprot = VM_PROT_ALL;
                pos = 0;
        } else {
                /*
                 * Mapping a file: get fp for validation, then obtain the
                 * vnode and make sure it is of the appropriate type.
                 * Don't let the descriptor disappear on us if we block.
                 */
                if ((error = fget(td, uap->fd, &fp)) != 0)
                        goto done;
                if (fp->f_type != DTYPE_VNODE) {
                        error = EINVAL;
                        goto done;
                }
                /*
                 * POSIX shared-memory objects are defined to have
                 * kernel persistence, and are not defined to support
                 * read(2)/write(2) -- or even open(2).  Thus, we can
                 * use MAP_NOSYNC to trade on-disk coherence for speed.
                 * The shm_open(3) library routine turns on the FPOSIXSHM
                 * flag to request this behavior.
                 */
                if (fp->f_flag & FPOSIXSHM)
                        flags |= MAP_NOSYNC;
                vp = fp->f_vnode;
                /*
                 * Ensure that file and memory protections are
                 * compatible.  Note that we only worry about
                 * writability if mapping is shared; in this case,
                 * current and max prot are dictated by the open file.
                 * XXX use the vnode instead?  Problem is: what
                 * credentials do we use for determination? What if
                 * proc does a setuid?
                 */
                if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
                        maxprot = VM_PROT_NONE;
                else
                        maxprot = VM_PROT_EXECUTE;
                if (fp->f_flag & FREAD) {
                        maxprot |= VM_PROT_READ;
                } else if (prot & PROT_READ) {
                        error = EACCES;
                        goto done;
                }
                /*
                 * If we are sharing potential changes (either via
                 * MAP_SHARED or via the implicit sharing of character
                 * device mappings), and we are trying to get write
                 * permission although we opened it without asking
                 * for it, bail out.
                 */
                if ((flags & MAP_SHARED) != 0) {
                        if ((fp->f_flag & FWRITE) != 0) {
                                maxprot |= VM_PROT_WRITE;
                        } else if ((prot & PROT_WRITE) != 0) {
                                error = EACCES;
                                goto done;
                        }
                } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
                        maxprot |= VM_PROT_WRITE;
                }
                handle = (void *)vp;
        }

        /*
         * Do not allow more than a certain number of vm_map_entry structures
         * per process.  Scale with the number of rforks sharing the map
         * to make the limit reasonable for threads.
         */
        if (max_proc_mmap &&
            vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) {
                error = ENOMEM;
                goto done;
        }

        error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
            flags, handle, pos);
        if (error == 0)
                td->td_retval[0] = (register_t) (addr + pageoff);
done:
        if (fp)
                fdrop(fp, td);

        return (error);
}

#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
        caddr_t addr;
        int len;
        int prot;
        int flags;
        int fd;
        long pos;
};
#endif
int
ommap(td, uap)
        struct thread *td;
        struct ommap_args *uap;
{
        struct mmap_args nargs;
        static const char cvtbsdprot[8] = {
                0,
                PROT_EXEC,
                PROT_WRITE,
                PROT_EXEC | PROT_WRITE,
                PROT_READ,
                PROT_EXEC | PROT_READ,
                PROT_WRITE | PROT_READ,
                PROT_EXEC | PROT_WRITE | PROT_READ,
        };

#define OMAP_ANON       0x0002
#define OMAP_COPY       0x0020
#define OMAP_SHARED     0x0010
#define OMAP_FIXED      0x0100

        nargs.addr = uap->addr;
        nargs.len = uap->len;
        nargs.prot = cvtbsdprot[uap->prot & 0x7];
        nargs.flags = 0;
        if (uap->flags & OMAP_ANON)
                nargs.flags |= MAP_ANON;
        if (uap->flags & OMAP_COPY)
                nargs.flags |= MAP_COPY;
        if (uap->flags & OMAP_SHARED)
                nargs.flags |= MAP_SHARED;
        else
                nargs.flags |= MAP_PRIVATE;
        if (uap->flags & OMAP_FIXED)
                nargs.flags |= MAP_FIXED;
        nargs.fd = uap->fd;
        nargs.pos = uap->pos;
        return (mmap(td, &nargs));
}
#endif                          /* COMPAT_43 */


#ifndef _SYS_SYSPROTO_H_
struct msync_args {
        void *addr;
        int len;
        int flags;
};
#endif
/*
 * MPSAFE
 */
int
msync(td, uap)
        struct thread *td;
        struct msync_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        int flags;
        vm_map_t map;
        int rv;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        flags = uap->flags;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
                return (EINVAL);

        map = &td->td_proc->p_vmspace->vm_map;

        /*
         * Clean the pages and interpret the return value.
         */
        rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
            (flags & MS_INVALIDATE) != 0);
        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
                return (EINVAL);        /* Sun returns ENOMEM? */
        case KERN_INVALID_ARGUMENT:
                return (EBUSY);
        default:
                return (EINVAL);
        }
}
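
/*
 * Usage sketch for the flag check above; MS_ASYNC and MS_INVALIDATE
 * are mutually exclusive, so a caller picks one mode:
 *
 *      msync(addr, len, MS_SYNC);        wait until the pages are clean
 *      msync(addr, len, MS_ASYNC);       start the writes, do not wait
 *      msync(addr, len, MS_INVALIDATE);  discard cached copies
 *      msync(addr, len, MS_ASYNC | MS_INVALIDATE);   fails with EINVAL
 */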

#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
        void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munmap(td, uap)
        struct thread *td;
        struct munmap_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        if (size == 0)
                return (EINVAL);

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        /*
         * Check for illegal addresses.  Watch out for address wrap...
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
                return (EINVAL);
        vm_map_lock(map);
        /*
         * Make sure entire range is allocated.
         */
        if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
                vm_map_unlock(map);
                return (EINVAL);
        }
        /* returns nothing but KERN_SUCCESS anyway */
        vm_map_delete(map, addr, addr + size);
        vm_map_unlock(map);
        return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
        const void *addr;
        size_t len;
        int prot;
};
#endif
/*
 * MPSAFE
 */
int
mprotect(td, uap)
        struct thread *td;
        struct mprotect_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_prot_t prot;

        addr = (vm_offset_t) uap->addr;
        size = uap->len;
        prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;
#endif

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, prot, FALSE)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}
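
/*
 * Usage sketch: revoking write access from a mapped region.  The
 * request is rounded out to page boundaries exactly as above.
 *
 *      if (mprotect(addr, len, PROT_READ) == -1)
 *              err(1, "mprotect");
 */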

#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
        void *addr;
        size_t len;
        int inherit;
};
#endif
/*
 * MPSAFE
 */
int
minherit(td, uap)
        struct thread *td;
        struct minherit_args *uap;
{
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_inherit_t inherit;

        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        inherit = uap->inherit;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t) round_page(size);
        if (addr + size < addr)
                return (EINVAL);

        switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
            addr + size, inherit)) {
        case KERN_SUCCESS:
                return (0);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        }
        return (EINVAL);
}

#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
        void *addr;
        size_t len;
        int behav;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
madvise(td, uap)
        struct thread *td;
        struct madvise_args *uap;
{
        vm_offset_t start, end;
        vm_map_t map;
        struct proc *p;
        int error;

        /*
         * Check for our special case, advising the swap pager we are
         * "immortal."
         */
        if (uap->behav == MADV_PROTECT) {
                error = suser(td);
                if (error == 0) {
                        p = td->td_proc;
                        PROC_LOCK(p);
                        p->p_flag |= P_PROTECTED;
                        PROC_UNLOCK(p);
                }
                return (error);
        }
        /*
         * Check for illegal behavior
         */
        if (uap->behav < 0 || uap->behav > MADV_CORE)
                return (EINVAL);
        /*
         * Check for illegal addresses.  Watch out for address wrap... Note
         * that VM_*_ADDRESS are not constants due to casts (argh).
         */
        map = &td->td_proc->p_vmspace->vm_map;
        if ((vm_offset_t)uap->addr < vm_map_min(map) ||
            (vm_offset_t)uap->addr + uap->len > vm_map_max(map))
                return (EINVAL);
        if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
                return (EINVAL);

        /*
         * Since this routine is only advisory, we default to conservative
         * behavior.
         */
        start = trunc_page((vm_offset_t) uap->addr);
        end = round_page((vm_offset_t) uap->addr + uap->len);

        if (vm_map_madvise(map, start, end, uap->behav))
                return (EINVAL);
        return (0);
}
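
/*
 * Usage sketch: ordinary advice values fall through to
 * vm_map_madvise(); MADV_PROTECT is intercepted above and only sets
 * P_PROTECTED on the calling process.
 *
 *      madvise(addr, len, MADV_WILLNEED);      expect access soon
 *      madvise(addr, len, MADV_DONTNEED);      pages may be reclaimed
 */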

#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
        const void *addr;
        size_t len;
        char *vec;
};
#endif

/*
 * MPSAFE
 */
/* ARGSUSED */
int
mincore(td, uap)
        struct thread *td;
        struct mincore_args *uap;
{
        vm_offset_t addr, first_addr;
        vm_offset_t end, cend;
        pmap_t pmap;
        vm_map_t map;
        char *vec;
        int error = 0;
        int vecindex, lastvecindex;
        vm_map_entry_t current;
        vm_map_entry_t entry;
        int mincoreinfo;
        unsigned int timestamp;

        /*
         * Make sure that the addresses presented are valid for user
         * mode.
         */
        first_addr = addr = trunc_page((vm_offset_t) uap->addr);
        end = addr + (vm_size_t)round_page(uap->len);
        map = &td->td_proc->p_vmspace->vm_map;
        if (end > vm_map_max(map) || end < addr)
                return (EINVAL);

        /*
         * Address of byte vector
         */
        vec = uap->vec;

        pmap = vmspace_pmap(td->td_proc->p_vmspace);

        vm_map_lock_read(map);
RestartScan:
        timestamp = map->timestamp;

        if (!vm_map_lookup_entry(map, addr, &entry))
                entry = entry->next;

        /*
         * Do this on a map entry basis so that if the pages are not
         * in the current process's address space, we can easily look
         * up the pages elsewhere.
         */
        lastvecindex = -1;
        for (current = entry;
            (current != &map->header) && (current->start < end);
            current = current->next) {

                /*
                 * ignore submaps (for now) or null objects
                 */
                if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
                        current->object.vm_object == NULL)
                        continue;

                /*
                 * limit this scan to the current map entry and the
                 * limits for the mincore call
                 */
                if (addr < current->start)
                        addr = current->start;
                cend = current->end;
                if (cend > end)
                        cend = end;

                /*
                 * scan this entry one page at a time
                 */
                while (addr < cend) {
                        /*
                         * Check pmap first, it is likely faster, also
                         * it can provide info as to whether we are the
                         * one referencing or modifying the page.
                         */
                        mincoreinfo = pmap_mincore(pmap, addr);
                        if (!mincoreinfo) {
                                vm_pindex_t pindex;
                                vm_ooffset_t offset;
                                vm_page_t m;
                                /*
                                 * calculate the page index into the object
                                 */
                                offset = current->offset + (addr - current->start);
                                pindex = OFF_TO_IDX(offset);
                                VM_OBJECT_LOCK(current->object.vm_object);
                                m = vm_page_lookup(current->object.vm_object,
                                        pindex);
                                /*
                                 * if the page is resident, then gather
                                 * information about it.
                                 */
                                if (m != NULL && m->valid != 0) {
                                        mincoreinfo = MINCORE_INCORE;
                                        vm_page_lock_queues();
                                        if (m->dirty ||
                                                pmap_is_modified(m))
                                                mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                        if ((m->flags & PG_REFERENCED) ||
                                                pmap_ts_referenced(m)) {
                                                vm_page_flag_set(m, PG_REFERENCED);
                                                mincoreinfo |= MINCORE_REFERENCED_OTHER;
                                        }
                                        vm_page_unlock_queues();
                                }
                                VM_OBJECT_UNLOCK(current->object.vm_object);
                        }

                        /*
                         * subyte may page fault.  In case it needs to modify
                         * the map, we release the lock.
                         */
                        vm_map_unlock_read(map);

                        /*
                         * calculate index into user supplied byte vector
                         */
                        vecindex = OFF_TO_IDX(addr - first_addr);

                        /*
                         * If we have skipped map entries, we need to make sure that
                         * the byte vector is zeroed for those skipped entries.
                         */
                        while ((lastvecindex + 1) < vecindex) {
                                ++lastvecindex;
                                error = subyte(vec + lastvecindex, 0);
                                if (error) {
                                        error = EFAULT;
                                        goto done2;
                                }
                        }

                        /*
                         * Pass the page information to the user
                         */
                        error = subyte(vec + vecindex, mincoreinfo);
                        if (error) {
                                error = EFAULT;
                                goto done2;
                        }

                        /*
                         * If the map has changed, due to the subyte, the previous
                         * output may be invalid.
                         */
                        vm_map_lock_read(map);
                        if (timestamp != map->timestamp)
                                goto RestartScan;

                        lastvecindex = vecindex;
                        addr += PAGE_SIZE;
                }
        }

        /*
         * subyte may page fault.  In case it needs to modify
         * the map, we release the lock.
         */
        vm_map_unlock_read(map);

        /*
         * Zero the last entries in the byte vector.
         */
        vecindex = OFF_TO_IDX(end - first_addr);
        while ((lastvecindex + 1) < vecindex) {
                ++lastvecindex;
                error = subyte(vec + lastvecindex, 0);
                if (error) {
                        error = EFAULT;
                        goto done2;
                }
        }

        /*
         * If the map has changed, due to the subyte, the previous
         * output may be invalid.
         */
        vm_map_lock_read(map);
        if (timestamp != map->timestamp)
                goto RestartScan;
        vm_map_unlock_read(map);
done2:
        return (error);
}
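
/*
 * Usage sketch: the caller supplies one byte per page of the range and
 * each byte receives the MINCORE_* bits gathered above.
 *
 *      size_t pgsz = getpagesize();
 *      size_t npages = (len + pgsz - 1) / pgsz;
 *      char *vec = malloc(npages);
 *      if (vec != NULL && mincore(addr, len, vec) == 0 &&
 *          (vec[0] & MINCORE_INCORE))
 *              printf("first page is resident\n");
 */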

#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
mlock(td, uap)
        struct thread *td;
        struct mlock_args *uap;
{
        struct proc *proc;
        vm_offset_t addr, end, last, start;
        vm_size_t npages, size;
        int error;

        error = suser(td);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        npages = atop(end - start);
        if (npages > vm_page_max_wired)
                return (ENOMEM);
        proc = td->td_proc;
        PROC_LOCK(proc);
        if (ptoa(npages +
            pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) >
            lim_cur(proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(proc);
        if (npages + cnt.v_wire_count > vm_page_max_wired)
                return (EAGAIN);
        error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
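
/*
 * Usage sketch: in this version mlock() requires privilege and is
 * bounded by both RLIMIT_MEMLOCK and vm_page_max_wired, as checked
 * above.
 *
 *      if (mlock(addr, len) == -1)
 *              warn("mlock");          typically EPERM, ENOMEM or EAGAIN
 */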

#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
        int     how;
};
#endif

/*
 * MPSAFE
 */
int
mlockall(td, uap)
        struct thread *td;
        struct mlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = 0;

        if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
                return (EINVAL);

#if 0
        /*
         * If wiring all pages in the process would cause it to exceed
         * a hard resource limit, return ENOMEM.
         */
        PROC_LOCK(td->td_proc);
        if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map))) >
            lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
                PROC_UNLOCK(td->td_proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(td->td_proc);
#else
        error = suser(td);
        if (error)
                return (error);
#endif

        if (uap->how & MCL_FUTURE) {
                vm_map_lock(map);
                vm_map_modflags(map, MAP_WIREFUTURE, 0);
                vm_map_unlock(map);
                error = 0;
        }

        if (uap->how & MCL_CURRENT) {
                /*
                 * P1003.1-2001 mandates that all currently mapped pages
                 * will be memory resident and locked (wired) upon return
                 * from mlockall(). vm_map_wire() will wire pages, by
                 * calling vm_fault_wire() for each page in the region.
                 */
                error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
                error = (error == KERN_SUCCESS ? 0 : EAGAIN);
        }

        return (error);
}
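
/*
 * Usage sketch: wiring both current and future mappings; with
 * MCL_FUTURE alone, only the MAP_WIREFUTURE map flag is set and pages
 * are wired as they are mapped.
 *
 *      if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *              err(1, "mlockall");
 */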

#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
        register_t dummy;
};
#endif

/*
 * MPSAFE
 */
int
munlockall(td, uap)
        struct thread *td;
        struct munlockall_args *uap;
{
        vm_map_t map;
        int error;

        map = &td->td_proc->p_vmspace->vm_map;
        error = suser(td);
        if (error)
                return (error);

        /* Clear the MAP_WIREFUTURE flag from this vm_map. */
        vm_map_lock(map);
        vm_map_modflags(map, 0, MAP_WIREFUTURE);
        vm_map_unlock(map);

        /* Forcibly unwire all pages. */
        error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
            VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);

        return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
        const void *addr;
        size_t len;
};
#endif
/*
 * MPSAFE
 */
int
munlock(td, uap)
        struct thread *td;
        struct munlock_args *uap;
{
        vm_offset_t addr, end, last, start;
        vm_size_t size;
        int error;

        error = suser(td);
        if (error)
                return (error);
        addr = (vm_offset_t)uap->addr;
        size = uap->len;
        last = addr + size;
        start = trunc_page(addr);
        end = round_page(last);
        if (last < addr || end < addr)
                return (EINVAL);
        error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
            VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
        return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * vm_mmap_vnode()
 *
 * MPSAFE
 *
 * Helper function for vm_mmap.  Performs the sanity checks specific to
 * mmap operations on vnodes.
 */
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
    vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
    struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp)
{
        struct vattr va;
        void *handle;
        vm_object_t obj;
        int error, flags, type;

        mtx_lock(&Giant);
        if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) {
                mtx_unlock(&Giant);
                return (error);
        }
        flags = *flagsp;
        if (vp->v_type == VREG) {
                /*
                 * Get the proper underlying object
                 */
                if (VOP_GETVOBJECT(vp, &obj) != 0) {
                        error = EINVAL;
                        goto done;
                }
                if (obj->handle != vp) {
                        vput(vp);
                        vp = (struct vnode *)obj->handle;
                        vget(vp, LK_EXCLUSIVE, td);
                }
                type = OBJT_VNODE;
                handle = vp;
        } else if (vp->v_type == VCHR) {
                type = OBJT_DEVICE;
                handle = vp->v_rdev;

                if (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) {
                        *maxprotp = VM_PROT_ALL;
                        *flagsp |= MAP_ANON;
                        error = 0;
                        goto done;
                }
                /*
                 * cdevs do not provide private mappings of any kind.
                 */
                if ((*maxprotp & VM_PROT_WRITE) == 0 &&
                    (prot & PROT_WRITE) != 0) {
                        error = EACCES;
                        goto done;
                }
                if (flags & (MAP_PRIVATE|MAP_COPY)) {
                        error = EINVAL;
                        goto done;
                }
                /*
                 * Force device mappings to be shared.
                 */
                flags |= MAP_SHARED;
        } else {
                error = EINVAL;
                goto done;
        }
        if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) {
                goto done;
        }
#ifdef MAC
        error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags);
        if (error != 0)
                goto done;
#endif
        if ((flags & MAP_SHARED) != 0) {
                if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
                        if (prot & PROT_WRITE) {
                                error = EPERM;
                                goto done;
                        }
                        *maxprotp &= ~VM_PROT_WRITE;
                }
        }
        /*
         * If it is a regular file without any references
         * we do not need to sync it.
         * Adjust object size to be the size of the actual file.
         */
        if (vp->v_type == VREG) {
                objsize = round_page(va.va_size);
                if (va.va_nlink == 0)
                        flags |= MAP_NOSYNC;
        }
        obj = vm_pager_allocate(type, handle, objsize, prot, foff);
        if (obj == NULL) {
                error = (type == OBJT_DEVICE ? EINVAL : ENOMEM);
                goto done;
        }
        *objp = obj;
        *flagsp = flags;
done:
        vput(vp);
        mtx_unlock(&Giant);
        return (error);
}

/*
 * vm_mmap()
 *
 * MPSAFE
 *
 * Internal version of mmap.  Currently used by mmap, exec, and sys5
 * shared memory.  Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
        vm_prot_t maxprot, int flags,
        void *handle,
        vm_ooffset_t foff)
{
        boolean_t fitit;
        vm_object_t object;
        int rv = KERN_SUCCESS;
        vm_ooffset_t objsize;
        int docow, error;
        struct thread *td = curthread;

        if (size == 0)
                return (0);

        objsize = size = round_page(size);

        PROC_LOCK(td->td_proc);
        if (td->td_proc->p_vmspace->vm_map.size + size >
            lim_cur(td->td_proc, RLIMIT_VMEM)) {
                PROC_UNLOCK(td->td_proc);
                return (ENOMEM);
        }
        PROC_UNLOCK(td->td_proc);

        /*
         * We currently can only deal with page aligned file offsets.
         * The check is here rather than in the syscall because the
         * kernel calls this function internally for other mmapping
         * operations (such as in exec) and non-aligned offsets will
         * cause pmap inconsistencies...so we want to be sure to
         * disallow this in all cases.
         */
        if (foff & PAGE_MASK)
                return (EINVAL);

        if ((flags & MAP_FIXED) == 0) {
                fitit = TRUE;
                *addr = round_page(*addr);
        } else {
                if (*addr != trunc_page(*addr))
                        return (EINVAL);
                fitit = FALSE;
                (void) vm_map_remove(map, *addr, *addr + size);
        }
        /*
         * Lookup/allocate object.
         */
        if (handle != NULL) {
                error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
                    handle, foff, &object);
                if (error) {
                        return (error);
                }
        }
        if (flags & MAP_ANON) {
                object = NULL;
                docow = 0;
                /*
                 * Unnamed anonymous regions always start at 0.
                 */
                if (handle == 0)
                        foff = 0;
        } else {
                docow = MAP_PREFAULT_PARTIAL;
        }

        if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
                docow |= MAP_COPY_ON_WRITE;
        if (flags & MAP_NOSYNC)
                docow |= MAP_DISABLE_SYNCER;
        if (flags & MAP_NOCORE)
                docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
        if (prot & VM_PROT_READ)
                prot |= VM_PROT_EXECUTE;

        if (maxprot & VM_PROT_READ)
                maxprot |= VM_PROT_EXECUTE;
#endif

        if (fitit)
                *addr = pmap_addr_hint(object, *addr, size);

        if (flags & MAP_STACK)
                rv = vm_map_stack(map, *addr, size, prot, maxprot,
                    docow | MAP_STACK_GROWS_DOWN);
        else
                rv = vm_map_find(map, object, foff, addr, size, fitit,
                                 prot, maxprot, docow);

        if (rv != KERN_SUCCESS) {
                /*
                 * Lose the object reference.  Will destroy the
                 * object if it's an unnamed anonymous mapping
                 * or named anonymous without other references.
                 */
                vm_object_deallocate(object);
        } else if (flags & MAP_SHARED) {
                /*
                 * Shared memory is also shared with children.
                 */
                rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
                if (rv != KERN_SUCCESS)
                        (void) vm_map_remove(map, *addr, *addr + size);
        }

        /*
         * If the process has requested that all future mappings
         * be wired, then heed this.
         */
        if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
                vm_map_wire(map, *addr, *addr + size,
                    VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);

        switch (rv) {
        case KERN_SUCCESS:
                return (0);
        case KERN_INVALID_ADDRESS:
        case KERN_NO_SPACE:
                return (ENOMEM);
        case KERN_PROTECTION_FAILURE:
                return (EACCES);
        default:
                return (EINVAL);
        }
}