
FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_mmap.c


    1 /*      $NetBSD: uvm_mmap.c,v 1.102.2.1 2007/03/10 12:17:58 bouyer Exp $        */
    2 
    3 /*
    4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
    5  * Copyright (c) 1991, 1993 The Regents of the University of California.
    6  * Copyright (c) 1988 University of Utah.
    7  *
    8  * All rights reserved.
    9  *
   10  * This code is derived from software contributed to Berkeley by
   11  * the Systems Programming Group of the University of Utah Computer
   12  * Science Department.
   13  *
   14  * Redistribution and use in source and binary forms, with or without
   15  * modification, are permitted provided that the following conditions
   16  * are met:
   17  * 1. Redistributions of source code must retain the above copyright
   18  *    notice, this list of conditions and the following disclaimer.
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  * 3. All advertising materials mentioning features or use of this software
   23  *    must display the following acknowledgement:
   24  *      This product includes software developed by the Charles D. Cranor,
   25  *      Washington University, University of California, Berkeley and
   26  *      its contributors.
   27  * 4. Neither the name of the University nor the names of its contributors
   28  *    may be used to endorse or promote products derived from this software
   29  *    without specific prior written permission.
   30  *
   31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   41  * SUCH DAMAGE.
   42  *
   43  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   44  *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
   45  * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
   46  */
   47 
   48 /*
   49  * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
   50  * function.
   51  */
   52 
   53 #include <sys/cdefs.h>
   54 __KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.102.2.1 2007/03/10 12:17:58 bouyer Exp $");
   55 
   56 #include "opt_compat_netbsd.h"
   57 #include "opt_pax.h"
   58 #include "veriexec.h"
   59 
   60 #include <sys/param.h>
   61 #include <sys/systm.h>
   62 #include <sys/file.h>
   63 #include <sys/filedesc.h>
   64 #include <sys/resourcevar.h>
   65 #include <sys/mman.h>
   66 #include <sys/mount.h>
   67 #include <sys/proc.h>
   68 #include <sys/malloc.h>
   69 #include <sys/vnode.h>
   70 #include <sys/conf.h>
   71 #include <sys/stat.h>
   72 
   73 #if NVERIEXEC > 0
   74 #include <sys/verified_exec.h>
   75 #endif /* NVERIEXEC > 0 */
   76  
   77 #ifdef PAX_MPROTECT
   78 #include <sys/pax.h>
   79 #endif /* PAX_MPROTECT */
   80 
   81 #include <miscfs/specfs/specdev.h>
   82 
   83 #include <sys/sa.h>
   84 #include <sys/syscallargs.h>
   85 
   86 #include <uvm/uvm.h>
   87 #include <uvm/uvm_device.h>
   88 
   89 #ifndef COMPAT_ZERODEV
   90 #define COMPAT_ZERODEV(dev)     (0)
   91 #endif
   92 
   93 /*
   94  * unimplemented VM system calls:
   95  */
   96 
   97 /*
   98  * sys_sbrk: sbrk system call.
   99  */
  100 
  101 /* ARGSUSED */
  102 int
  103 sys_sbrk(struct lwp *l, void *v, register_t *retval)
  104 {
  105 #if 0
  106         struct sys_sbrk_args /* {
  107                 syscallarg(intptr_t) incr;
  108         } */ *uap = v;
  109 #endif
  110 
  111         return (ENOSYS);
  112 }
  113 
  114 /*
  115  * sys_sstk: sstk system call.
  116  */
  117 
  118 /* ARGSUSED */
  119 int
  120 sys_sstk(struct lwp *l, void *v, register_t *retval)
  121 {
  122 #if 0
  123         struct sys_sstk_args /* {
  124                 syscallarg(int) incr;
  125         } */ *uap = v;
  126 #endif
  127 
  128         return (ENOSYS);
  129 }
  130 
  131 /*
  132  * sys_mincore: determine if pages are in core or not.
  133  */
  134 
  135 /* ARGSUSED */
  136 int
  137 sys_mincore(struct lwp *l, void *v, register_t *retval)
  138 {
  139         struct sys_mincore_args /* {
  140                 syscallarg(void *) addr;
  141                 syscallarg(size_t) len;
  142                 syscallarg(char *) vec;
  143         } */ *uap = v;
  144         struct proc *p = l->l_proc;
  145         struct vm_page *pg;
  146         char *vec, pgi;
  147         struct uvm_object *uobj;
  148         struct vm_amap *amap;
  149         struct vm_anon *anon;
  150         struct vm_map_entry *entry;
  151         vaddr_t start, end, lim;
  152         struct vm_map *map;
  153         vsize_t len;
  154         int error = 0, npgs;
  155 
  156         map = &p->p_vmspace->vm_map;
  157 
  158         start = (vaddr_t)SCARG(uap, addr);
  159         len = SCARG(uap, len);
  160         vec = SCARG(uap, vec);
  161 
  162         if (start & PAGE_MASK)
  163                 return (EINVAL);
  164         len = round_page(len);
  165         end = start + len;
  166         if (end <= start)
  167                 return (EINVAL);
  168 
  169         /*
  170          * Lock down vec, so our returned status isn't outdated by
  171          * storing the status byte for a page.
  172          */
  173 
  174         npgs = len >> PAGE_SHIFT;
  175         error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
  176         if (error) {
  177                 return error;
  178         }
  179         vm_map_lock_read(map);
  180 
  181         if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
  182                 error = ENOMEM;
  183                 goto out;
  184         }
  185 
  186         for (/* nothing */;
  187              entry != &map->header && entry->start < end;
  188              entry = entry->next) {
  189                 KASSERT(!UVM_ET_ISSUBMAP(entry));
  190                 KASSERT(start >= entry->start);
  191 
  192                 /* Make sure there are no holes. */
  193                 if (entry->end < end &&
  194                      (entry->next == &map->header ||
  195                       entry->next->start > entry->end)) {
  196                         error = ENOMEM;
  197                         goto out;
  198                 }
  199 
  200                 lim = end < entry->end ? end : entry->end;
  201 
  202                 /*
  203                  * Special case for objects with no "real" pages.  Those
  204                  * are always considered resident (mapped devices).
  205                  */
  206 
  207                 if (UVM_ET_ISOBJ(entry)) {
  208                         KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
  209                         if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
  210                                 for (/* nothing */; start < lim;
  211                                      start += PAGE_SIZE, vec++)
  212                                         subyte(vec, 1);
  213                                 continue;
  214                         }
  215                 }
  216 
  217                 amap = entry->aref.ar_amap;     /* top layer */
  218                 uobj = entry->object.uvm_obj;   /* bottom layer */
  219 
  220                 if (amap != NULL)
  221                         amap_lock(amap);
  222                 if (uobj != NULL)
  223                         simple_lock(&uobj->vmobjlock);
  224 
  225                 for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
  226                         pgi = 0;
  227                         if (amap != NULL) {
  228                                 /* Check the top layer first. */
  229                                 anon = amap_lookup(&entry->aref,
  230                                     start - entry->start);
  231                                 /* Don't need to lock anon here. */
  232                                 if (anon != NULL && anon->an_page != NULL) {
  233 
  234                                         /*
  235                                          * Anon has the page for this entry
  236                                          * offset.
  237                                          */
  238 
  239                                         pgi = 1;
  240                                 }
  241                         }
  242                         if (uobj != NULL && pgi == 0) {
  243                                 /* Check the bottom layer. */
  244                                 pg = uvm_pagelookup(uobj,
  245                                     entry->offset + (start - entry->start));
  246                                 if (pg != NULL) {
  247 
  248                                         /*
  249                                          * Object has the page for this entry
  250                                          * offset.
  251                                          */
  252 
  253                                         pgi = 1;
  254                                 }
  255                         }
  256                         (void) subyte(vec, pgi);
  257                 }
  258                 if (uobj != NULL)
  259                         simple_unlock(&uobj->vmobjlock);
  260                 if (amap != NULL)
  261                         amap_unlock(amap);
  262         }
  263 
  264  out:
  265         vm_map_unlock_read(map);
  266         uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
  267         return (error);
  268 }
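
The loop above writes one status byte per page into the user's vec: 1 when the page is resident in the anon (amap) layer or the object layer, or when the entry is a device mapping, 0 otherwise. The following userland sketch is illustrative only (not part of this file) and assumes the BSD prototype where vec is a plain char array:

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        size_t len = 8 * pgsz;
        size_t i, npgs = len / pgsz;
        char *buf, *vec;

        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (buf == MAP_FAILED)
                return 1;
        memset(buf, 0, 2 * pgsz);       /* touch the first two pages only */

        vec = malloc(npgs);
        if (vec == NULL || mincore(buf, len, vec) == -1)
                return 1;
        for (i = 0; i < npgs; i++)      /* typically prints 1 1 0 0 0 0 0 0 */
                printf("%d ", vec[i] & 1);
        printf("\n");
        free(vec);
        munmap(buf, len);
        return 0;
}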
  269 
  270 /*
  271  * sys_mmap: mmap system call.
  272  *
  273  * => file offset and address may not be page aligned
   274  *    - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
  275  *    - if address isn't page aligned the mapping starts at trunc_page(addr)
  276  *      and the return value is adjusted up by the page offset.
  277  */
  278 
  279 int
  280 sys_mmap(l, v, retval)
  281         struct lwp *l;
  282         void *v;
  283         register_t *retval;
  284 {
  285         struct sys_mmap_args /* {
  286                 syscallarg(caddr_t) addr;
  287                 syscallarg(size_t) len;
  288                 syscallarg(int) prot;
  289                 syscallarg(int) flags;
  290                 syscallarg(int) fd;
  291                 syscallarg(long) pad;
  292                 syscallarg(off_t) pos;
  293         } */ *uap = v;
  294         struct proc *p = l->l_proc;
  295         vaddr_t addr;
  296         struct vattr va;
  297         off_t pos;
  298         vsize_t size, pageoff;
  299         vm_prot_t prot, maxprot;
  300         int flags, fd;
  301         vaddr_t vm_min_address = VM_MIN_ADDRESS, defaddr;
  302         struct filedesc *fdp = p->p_fd;
  303         struct file *fp;
  304         struct vnode *vp;
  305         void *handle;
  306         int error;
  307 
  308         /*
  309          * first, extract syscall args from the uap.
  310          */
  311 
  312         addr = (vaddr_t)SCARG(uap, addr);
  313         size = (vsize_t)SCARG(uap, len);
  314         prot = SCARG(uap, prot) & VM_PROT_ALL;
  315         flags = SCARG(uap, flags);
  316         fd = SCARG(uap, fd);
  317         pos = SCARG(uap, pos);
  318 
  319         /*
  320          * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
  321          * validate the flags.
  322          */
  323         if (flags & MAP_COPY)
  324                 flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
  325         if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
  326                 return (EINVAL);
  327 
  328         /*
  329          * align file position and save offset.  adjust size.
  330          */
  331 
  332         pageoff = (pos & PAGE_MASK);
  333         pos  -= pageoff;
  334         size += pageoff;                        /* add offset */
  335         size = (vsize_t)round_page(size);       /* round up */
  336         if ((ssize_t) size < 0)
  337                 return (EINVAL);                        /* don't allow wrap */
  338 
  339         /*
  340          * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
  341          */
  342 
  343         if (flags & MAP_FIXED) {
  344 
  345                 /* ensure address and file offset are aligned properly */
  346                 addr -= pageoff;
  347                 if (addr & PAGE_MASK)
  348                         return (EINVAL);
  349 
  350                 if (VM_MAXUSER_ADDRESS > 0 &&
  351                     (addr + size) > VM_MAXUSER_ADDRESS)
  352                         return (EFBIG);
  353                 if (vm_min_address > 0 && addr < vm_min_address)
  354                         return (EINVAL);
  355                 if (addr > addr + size)
  356                         return (EOVERFLOW);             /* no wrapping! */
  357 
  358         } else if (addr == 0 || !(flags & MAP_TRYFIXED)) {
  359 
  360                 /*
  361                  * not fixed: make sure we skip over the largest
  362                  * possible heap for non-topdown mapping arrangements.
  363                  * we will refine our guess later (e.g. to account for
  364                  * VAC, etc)
  365                  */
  366 
  367                 defaddr = p->p_emul->e_vm_default_addr(p,
  368                     (vaddr_t)p->p_vmspace->vm_daddr, size);
  369 
  370                 if (addr == 0 ||
  371                     !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
  372                         addr = MAX(addr, defaddr);
  373                 else
  374                         addr = MIN(addr, defaddr);
  375         }
  376 
  377         /*
  378          * check for file mappings (i.e. not anonymous) and verify file.
  379          */
  380 
  381         if ((flags & MAP_ANON) == 0) {
  382 
  383                 if ((fp = fd_getfile(fdp, fd)) == NULL)
  384                         return (EBADF);
  385 
  386                 simple_unlock(&fp->f_slock);
  387 
  388                 if (fp->f_type != DTYPE_VNODE)
  389                         return (ENODEV);                /* only mmap vnodes! */
  390                 vp = (struct vnode *)fp->f_data;        /* convert to vnode */
  391 
  392                 if (vp->v_type != VREG && vp->v_type != VCHR &&
  393                     vp->v_type != VBLK)
  394                         return (ENODEV);  /* only REG/CHR/BLK support mmap */
  395 
  396                 if (vp->v_type != VCHR && pos < 0)
  397                         return (EINVAL);
  398 
  399                 if (vp->v_type != VCHR && (pos + size) < pos)
  400                         return (EOVERFLOW);             /* no offset wrapping */
  401 
  402                 /* special case: catch SunOS style /dev/zero */
  403                 if (vp->v_type == VCHR
  404                     && (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
  405                         flags |= MAP_ANON;
  406                         goto is_anon;
  407                 }
  408 
  409                 /*
  410                  * Old programs may not select a specific sharing type, so
  411                  * default to an appropriate one.
  412                  *
  413                  * XXX: how does MAP_ANON fit in the picture?
  414                  */
  415                 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
  416 #if defined(DEBUG)
  417                         printf("WARNING: defaulted mmap() share type to "
  418                            "%s (pid %d command %s)\n", vp->v_type == VCHR ?
  419                            "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
  420                             p->p_comm);
  421 #endif
  422                         if (vp->v_type == VCHR)
  423                                 flags |= MAP_SHARED;    /* for a device */
  424                         else
  425                                 flags |= MAP_PRIVATE;   /* for a file */
  426                 }
  427 
  428                 /*
  429                  * MAP_PRIVATE device mappings don't make sense (and aren't
  430                  * supported anyway).  However, some programs rely on this,
  431                  * so just change it to MAP_SHARED.
  432                  */
  433                 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
  434                         flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
  435                 }
  436 
  437                 /*
  438                  * now check protection
  439                  */
  440 
  441                 maxprot = VM_PROT_EXECUTE;
  442 
  443 #if NVERIEXEC > 0
  444                 /*
  445                  * Check if the file can be executed indirectly.
  446                  */
  447                 if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT, NULL)) {
  448                         /*
  449                          * Don't allow executable mappings if we can't
  450                          * indirectly execute the file.
  451                          */
  452                         if (prot & VM_PROT_EXECUTE)
  453                                 return (EPERM);
  454 
  455                         /*
  456                          * Strip the executable bit from 'maxprot' to make sure
  457                          * it can't be made executable later.
  458                          */
  459                         maxprot &= ~VM_PROT_EXECUTE;
  460                 }
  461 #endif /* NVERIEXEC > 0 */
  462 
  463                 /* check read access */
  464                 if (fp->f_flag & FREAD)
  465                         maxprot |= VM_PROT_READ;
  466                 else if (prot & PROT_READ)
  467                         return (EACCES);
  468 
  469                 /* check write access, shared case first */
  470                 if (flags & MAP_SHARED) {
  471                         /*
   472                          * if the file is writable, only add PROT_WRITE to
   473                          * maxprot if the file is not immutable, append-only,
   474                          * or a snapshot.  otherwise, if we have asked for
   475                          * PROT_WRITE, return EPERM.
  476                          */
  477                         if (fp->f_flag & FWRITE) {
  478                                 if ((error =
  479                                     VOP_GETATTR(vp, &va, l->l_cred, l)))
  480                                         return (error);
  481                                 if ((va.va_flags &
  482                                     (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
  483                                         maxprot |= VM_PROT_WRITE;
  484                                 else if (prot & PROT_WRITE)
  485                                         return (EPERM);
  486                         }
  487                         else if (prot & PROT_WRITE)
  488                                 return (EACCES);
  489                 } else {
   490                         /* MAP_PRIVATE mappings can always be written to */
  491                         maxprot |= VM_PROT_WRITE;
  492                 }
  493                 handle = vp;
  494 
  495         } else {                /* MAP_ANON case */
  496                 /*
  497                  * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
  498                  */
  499                 if (fd != -1)
  500                         return (EINVAL);
  501 
  502  is_anon:               /* label for SunOS style /dev/zero */
  503                 handle = NULL;
  504                 maxprot = VM_PROT_ALL;
  505                 pos = 0;
  506         }
  507 
  508         /*
  509          * XXX (in)sanity check.  We don't do proper datasize checking
  510          * XXX for anonymous (or private writable) mmap().  However,
   511          * XXX we do know that if we're trying to allocate more than the amount
  512          * XXX remaining under our current data size limit, _that_ should
  513          * XXX be disallowed.
  514          */
  515         if ((flags & MAP_ANON) != 0 ||
  516             ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
  517                 if (size >
  518                     (p->p_rlimit[RLIMIT_DATA].rlim_cur -
  519                      ctob(p->p_vmspace->vm_dsize))) {
  520                         return (ENOMEM);
  521                 }
  522         }
  523 
  524 #ifdef PAX_MPROTECT
  525         pax_mprotect(l, &prot, &maxprot);
  526 #endif /* PAX_MPROTECT */
  527 
  528         /*
  529          * now let kernel internal function uvm_mmap do the work.
  530          */
  531 
  532         error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
  533             flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
  534 
  535         if (error == 0)
  536                 /* remember to add offset */
  537                 *retval = (register_t)(addr + pageoff);
  538 
  539         return (error);
  540 }
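
For orientation, a hypothetical userland caller of the path above: a MAP_PRIVATE mapping of a regular file takes the DTYPE_VNODE/VREG branch, picks up VM_PROT_READ from the FREAD check, and reaches uvm_mmap() with the vnode as handle. This sketch is illustrative and abbreviates error handling:

#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
        struct stat st;
        int fd;
        char *p;

        if (argc != 2 || (fd = open(argv[1], O_RDONLY)) == -1)
                return 1;
        if (fstat(fd, &st) == -1 || st.st_size == 0)
                return 1;

        /* offset 0 is page aligned; the kernel rounds the length up. */
        p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        printf("first byte: 0x%02x\n", (unsigned char)p[0]);

        munmap(p, (size_t)st.st_size);
        close(fd);
        return 0;
}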
  541 
  542 /*
  543  * sys___msync13: the msync system call (a front-end for flush)
  544  */
  545 
  546 int
  547 sys___msync13(struct lwp *l, void *v, register_t *retval)
  548 {
  549         struct sys___msync13_args /* {
  550                 syscallarg(caddr_t) addr;
  551                 syscallarg(size_t) len;
  552                 syscallarg(int) flags;
  553         } */ *uap = v;
  554         struct proc *p = l->l_proc;
  555         vaddr_t addr;
  556         vsize_t size, pageoff;
  557         struct vm_map *map;
  558         int error, rv, flags, uvmflags;
  559 
  560         /*
  561          * extract syscall args from the uap
  562          */
  563 
  564         addr = (vaddr_t)SCARG(uap, addr);
  565         size = (vsize_t)SCARG(uap, len);
  566         flags = SCARG(uap, flags);
  567 
  568         /* sanity check flags */
  569         if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
  570             (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
  571             (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
  572                 return (EINVAL);
  573         if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
  574                 flags |= MS_SYNC;
  575 
  576         /*
  577          * align the address to a page boundary and adjust the size accordingly.
  578          */
  579 
  580         pageoff = (addr & PAGE_MASK);
  581         addr -= pageoff;
  582         size += pageoff;
  583         size = (vsize_t)round_page(size);
  584 
  585         /* disallow wrap-around. */
  586         if (addr + size < addr)
  587                 return (EINVAL);
  588 
  589         /*
  590          * get map
  591          */
  592 
  593         map = &p->p_vmspace->vm_map;
  594 
  595         /*
  596          * XXXCDC: do we really need this semantic?
  597          *
  598          * XXX Gak!  If size is zero we are supposed to sync "all modified
   599          * pages within the region containing addr".  Unfortunately, we
  600          * don't really keep track of individual mmaps so we approximate
  601          * by flushing the range of the map entry containing addr.
  602          * This can be incorrect if the region splits or is coalesced
  603          * with a neighbor.
  604          */
  605 
  606         if (size == 0) {
  607                 struct vm_map_entry *entry;
  608 
  609                 vm_map_lock_read(map);
  610                 rv = uvm_map_lookup_entry(map, addr, &entry);
  611                 if (rv == TRUE) {
  612                         addr = entry->start;
  613                         size = entry->end - entry->start;
  614                 }
  615                 vm_map_unlock_read(map);
  616                 if (rv == FALSE)
  617                         return (EINVAL);
  618         }
  619 
  620         /*
  621          * translate MS_ flags into PGO_ flags
  622          */
  623 
  624         uvmflags = PGO_CLEANIT;
  625         if (flags & MS_INVALIDATE)
  626                 uvmflags |= PGO_FREE;
  627         if (flags & MS_SYNC)
  628                 uvmflags |= PGO_SYNCIO;
  629 
  630         error = uvm_map_clean(map, addr, addr+size, uvmflags);
  631         return error;
  632 }
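
A small illustrative caller (assumed, not from this file) of the flag rules enforced above: at most one of MS_ASYNC or MS_SYNC (neither is promoted to MS_SYNC), optionally combined with MS_INVALIDATE; MS_SYNC becomes PGO_CLEANIT|PGO_SYNCIO. The file name is a placeholder:

#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        /* "scratch.dat" is a placeholder name for this sketch. */
        int fd = open("scratch.dat", O_RDWR | O_CREAT, 0644);
        size_t len = 4096;
        char *p;

        if (fd == -1 || ftruncate(fd, (off_t)len) == -1)
                return 1;
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        strcpy(p, "hello");
        /* Synchronously flush the dirty page back to the file. */
        if (msync(p, len, MS_SYNC) == -1)
                return 1;

        munmap(p, len);
        close(fd);
        return 0;
}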
  633 
  634 /*
   635  * sys_munmap: unmap a user's memory
  636  */
  637 
  638 int
  639 sys_munmap(struct lwp *l, void *v, register_t *retval)
  640 {
  641         struct sys_munmap_args /* {
  642                 syscallarg(caddr_t) addr;
  643                 syscallarg(size_t) len;
  644         } */ *uap = v;
  645         struct proc *p = l->l_proc;
  646         vaddr_t addr;
  647         vsize_t size, pageoff;
  648         struct vm_map *map;
  649         vaddr_t vm_min_address = VM_MIN_ADDRESS;
  650         struct vm_map_entry *dead_entries;
  651 
  652         /*
  653          * get syscall args.
  654          */
  655 
  656         addr = (vaddr_t)SCARG(uap, addr);
  657         size = (vsize_t)SCARG(uap, len);
  658 
  659         /*
  660          * align the address to a page boundary and adjust the size accordingly.
  661          */
  662 
  663         pageoff = (addr & PAGE_MASK);
  664         addr -= pageoff;
  665         size += pageoff;
  666         size = (vsize_t)round_page(size);
  667 
  668         if ((int)size < 0)
  669                 return (EINVAL);
  670         if (size == 0)
  671                 return (0);
  672 
  673         /*
  674          * Check for illegal addresses.  Watch out for address wrap...
  675          * Note that VM_*_ADDRESS are not constants due to casts (argh).
  676          */
  677         if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
  678                 return (EINVAL);
  679         if (vm_min_address > 0 && addr < vm_min_address)
  680                 return (EINVAL);
  681         if (addr > addr + size)
  682                 return (EINVAL);
  683         map = &p->p_vmspace->vm_map;
  684 
  685         /*
  686          * interesting system call semantic: make sure entire range is
  687          * allocated before allowing an unmap.
  688          */
  689 
  690         vm_map_lock(map);
  691 #if 0
  692         if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
  693                 vm_map_unlock(map);
  694                 return (EINVAL);
  695         }
  696 #endif
  697         uvm_unmap_remove(map, addr, addr + size, &dead_entries, NULL, 0);
  698         vm_map_unlock(map);
  699         if (dead_entries != NULL)
  700                 uvm_unmap_detach(dead_entries, 0);
  701         return (0);
  702 }
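
The kernel rounds the range to page boundaries and removes whatever entries fall inside it; with the "entire range allocated" check compiled out above, a partial unmap that punches a hole in a larger mapping is accepted. A hypothetical sketch:

#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 4 * pgsz, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        /* Unmapping the middle pages leaves two separate mappings. */
        if (munmap(p + pgsz, 2 * pgsz) == -1)
                return 1;
        p[0] = 1;               /* still mapped */
        p[3 * pgsz] = 1;        /* still mapped */
        return 0;
}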
  703 
  704 /*
  705  * sys_mprotect: the mprotect system call
  706  */
  707 
  708 int
  709 sys_mprotect(struct lwp *l, void *v, register_t *retval)
  710 {
  711         struct sys_mprotect_args /* {
  712                 syscallarg(caddr_t) addr;
  713                 syscallarg(size_t) len;
  714                 syscallarg(int) prot;
  715         } */ *uap = v;
  716         struct proc *p = l->l_proc;
  717         vaddr_t addr;
  718         vsize_t size, pageoff;
  719         vm_prot_t prot;
  720         int error;
  721 
  722         /*
  723          * extract syscall args from uap
  724          */
  725 
  726         addr = (vaddr_t)SCARG(uap, addr);
  727         size = (vsize_t)SCARG(uap, len);
  728         prot = SCARG(uap, prot) & VM_PROT_ALL;
  729 
  730         /*
  731          * align the address to a page boundary and adjust the size accordingly.
  732          */
  733 
  734         pageoff = (addr & PAGE_MASK);
  735         addr -= pageoff;
  736         size += pageoff;
  737         size = round_page(size);
  738 
  739         error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
  740                                 FALSE);
  741         return error;
  742 }
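
sys_mprotect() masks the requested protection with VM_PROT_ALL and applies it to the page-rounded range via uvm_map_protect(). A typical userland use, shown here as an illustrative sketch, is turning a page into an inaccessible guard page:

#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, 2 * pgsz, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        /* Revoke all access to the second page to use it as a guard. */
        if (mprotect(p + pgsz, pgsz, PROT_NONE) == -1)
                return 1;
        p[0] = 1;       /* fine: first page is still read/write */
        /* p[pgsz] = 1; would now deliver SIGSEGV. */
        return 0;
}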
  743 
  744 /*
  745  * sys_minherit: the minherit system call
  746  */
  747 
  748 int
  749 sys_minherit(struct lwp *l, void *v, register_t *retval)
  750 {
  751         struct sys_minherit_args /* {
  752                 syscallarg(caddr_t) addr;
  753                 syscallarg(int) len;
  754                 syscallarg(int) inherit;
  755         } */ *uap = v;
  756         struct proc *p = l->l_proc;
  757         vaddr_t addr;
  758         vsize_t size, pageoff;
  759         vm_inherit_t inherit;
  760         int error;
  761 
  762         addr = (vaddr_t)SCARG(uap, addr);
  763         size = (vsize_t)SCARG(uap, len);
  764         inherit = SCARG(uap, inherit);
  765 
  766         /*
  767          * align the address to a page boundary and adjust the size accordingly.
  768          */
  769 
  770         pageoff = (addr & PAGE_MASK);
  771         addr -= pageoff;
  772         size += pageoff;
  773         size = (vsize_t)round_page(size);
  774 
  775         if ((int)size < 0)
  776                 return (EINVAL);
  777         error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
  778                                 inherit);
  779         return error;
  780 }
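
sys_minherit() controls what a child address space receives across fork() for the given range. The sketch below is illustrative; MAP_INHERIT_NONE is the NetBSD spelling of the "no inheritance" constant, and other systems name it differently:

#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        char *secret = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (secret == MAP_FAILED)
                return 1;
        /* Keep this page out of any child created by fork(). */
        if (minherit(secret, pgsz, MAP_INHERIT_NONE) == -1)
                return 1;
        /* A child forked after this point has a hole at 'secret'. */
        return 0;
}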
  781 
  782 /*
  783  * sys_madvise: give advice about memory usage.
  784  */
  785 
  786 /* ARGSUSED */
  787 int
  788 sys_madvise(struct lwp *l, void *v, register_t *retval)
  789 {
  790         struct sys_madvise_args /* {
  791                 syscallarg(caddr_t) addr;
  792                 syscallarg(size_t) len;
  793                 syscallarg(int) behav;
  794         } */ *uap = v;
  795         struct proc *p = l->l_proc;
  796         vaddr_t addr;
  797         vsize_t size, pageoff;
  798         int advice, error;
  799 
  800         addr = (vaddr_t)SCARG(uap, addr);
  801         size = (vsize_t)SCARG(uap, len);
  802         advice = SCARG(uap, behav);
  803 
  804         /*
  805          * align the address to a page boundary, and adjust the size accordingly
  806          */
  807 
  808         pageoff = (addr & PAGE_MASK);
  809         addr -= pageoff;
  810         size += pageoff;
  811         size = (vsize_t)round_page(size);
  812 
  813         if ((ssize_t)size <= 0)
  814                 return (EINVAL);
  815 
  816         switch (advice) {
  817         case MADV_NORMAL:
  818         case MADV_RANDOM:
  819         case MADV_SEQUENTIAL:
  820                 error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
  821                     advice);
  822                 break;
  823 
  824         case MADV_WILLNEED:
  825 
  826                 /*
  827                  * Activate all these pages, pre-faulting them in if
  828                  * necessary.
  829                  */
  830                 /*
  831                  * XXX IMPLEMENT ME.
  832                  * Should invent a "weak" mode for uvm_fault()
  833                  * which would only do the PGO_LOCKED pgo_get().
  834                  */
  835 
  836                 return (0);
  837 
  838         case MADV_DONTNEED:
  839 
  840                 /*
  841                  * Deactivate all these pages.  We don't need them
  842                  * any more.  We don't, however, toss the data in
  843                  * the pages.
  844                  */
  845 
  846                 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
  847                     PGO_DEACTIVATE);
  848                 break;
  849 
  850         case MADV_FREE:
  851 
  852                 /*
  853                  * These pages contain no valid data, and may be
  854                  * garbage-collected.  Toss all resources, including
  855                  * any swap space in use.
  856                  */
  857 
  858                 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
  859                     PGO_FREE);
  860                 break;
  861 
  862         case MADV_SPACEAVAIL:
  863 
  864                 /*
  865                  * XXXMRG What is this?  I think it's:
  866                  *
  867                  *      Ensure that we have allocated backing-store
  868                  *      for these pages.
  869                  *
  870                  * This is going to require changes to the page daemon,
  871                  * as it will free swap space allocated to pages in core.
  872                  * There's also what to do for device/file/anonymous memory.
  873                  */
  874 
  875                 return (EINVAL);
  876 
  877         default:
  878                 return (EINVAL);
  879         }
  880 
  881         return error;
  882 }
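
The switch above maps MADV_NORMAL/RANDOM/SEQUENTIAL onto uvm_map_advice(), MADV_DONTNEED onto page deactivation, and MADV_FREE onto discarding the pages and any swap backing them; MADV_WILLNEED is accepted but is currently a no-op. A hypothetical caller, with return values of the hints ignored for brevity:

#include <sys/mman.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        size_t len = 16 * pgsz;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        /* Hint a one-pass access pattern (uvm_map_advice above). */
        madvise(p, len, MADV_SEQUENTIAL);
        memset(p, 0xa5, len);
        /* Done with the data: MADV_FREE lets the pager discard it. */
        madvise(p, len, MADV_FREE);
        return 0;
}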
  883 
  884 /*
  885  * sys_mlock: memory lock
  886  */
  887 
  888 int
  889 sys_mlock(struct lwp *l, void *v, register_t *retval)
  890 {
  891         struct sys_mlock_args /* {
  892                 syscallarg(const void *) addr;
  893                 syscallarg(size_t) len;
  894         } */ *uap = v;
  895         struct proc *p = l->l_proc;
  896         vaddr_t addr;
  897         vsize_t size, pageoff;
  898         int error;
  899 
  900         /*
  901          * extract syscall args from uap
  902          */
  903 
  904         addr = (vaddr_t)SCARG(uap, addr);
  905         size = (vsize_t)SCARG(uap, len);
  906 
  907         /*
  908          * align the address to a page boundary and adjust the size accordingly
  909          */
  910 
  911         pageoff = (addr & PAGE_MASK);
  912         addr -= pageoff;
  913         size += pageoff;
  914         size = (vsize_t)round_page(size);
  915 
  916         /* disallow wrap-around. */
  917         if (addr + size < addr)
  918                 return (EINVAL);
  919 
  920         if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
  921                 return (EAGAIN);
  922 
  923         if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
  924                         p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
  925                 return (EAGAIN);
  926 
  927         error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
  928             0);
  929         if (error == EFAULT)
  930                 error = ENOMEM;
  931         return error;
  932 }
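
sys_mlock() wires the page-rounded range, failing with EAGAIN when the request would exceed the global uvmexp.wiredmax or the per-process RLIMIT_MEMLOCK limit, and remapping EFAULT from uvm_map_pageable() to ENOMEM. An illustrative use for memory that must never reach swap:

#include <sys/mman.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
        char *key = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);

        if (key == MAP_FAILED)
                return 1;
        /* Wire the page so it is never written out to swap. */
        if (mlock(key, pgsz) == -1)
                return 1;       /* EAGAIN: over RLIMIT_MEMLOCK or wiredmax */
        /* ... use the sensitive data ... */
        memset(key, 0, pgsz);
        munlock(key, pgsz);
        munmap(key, pgsz);
        return 0;
}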
  933 
  934 /*
  935  * sys_munlock: unlock wired pages
  936  */
  937 
  938 int
  939 sys_munlock(struct lwp *l, void *v, register_t *retval)
  940 {
  941         struct sys_munlock_args /* {
  942                 syscallarg(const void *) addr;
  943                 syscallarg(size_t) len;
  944         } */ *uap = v;
  945         struct proc *p = l->l_proc;
  946         vaddr_t addr;
  947         vsize_t size, pageoff;
  948         int error;
  949 
  950         /*
  951          * extract syscall args from uap
  952          */
  953 
  954         addr = (vaddr_t)SCARG(uap, addr);
  955         size = (vsize_t)SCARG(uap, len);
  956 
  957         /*
  958          * align the address to a page boundary, and adjust the size accordingly
  959          */
  960 
  961         pageoff = (addr & PAGE_MASK);
  962         addr -= pageoff;
  963         size += pageoff;
  964         size = (vsize_t)round_page(size);
  965 
  966         /* disallow wrap-around. */
  967         if (addr + size < addr)
  968                 return (EINVAL);
  969 
  970         error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
  971             0);
  972         if (error == EFAULT)
  973                 error = ENOMEM;
  974         return error;
  975 }
  976 
  977 /*
  978  * sys_mlockall: lock all pages mapped into an address space.
  979  */
  980 
  981 int
  982 sys_mlockall(struct lwp *l, void *v, register_t *retval)
  983 {
  984         struct sys_mlockall_args /* {
  985                 syscallarg(int) flags;
  986         } */ *uap = v;
  987         struct proc *p = l->l_proc;
  988         int error, flags;
  989 
  990         flags = SCARG(uap, flags);
  991 
  992         if (flags == 0 ||
  993             (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
  994                 return (EINVAL);
  995 
  996         error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
  997             p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
  998         return (error);
  999 }
 1000 
 1001 /*
 1002  * sys_munlockall: unlock all pages mapped into an address space.
 1003  */
 1004 
 1005 int
 1006 sys_munlockall(struct lwp *l, void *v, register_t *retval)
 1007 {
 1008         struct proc *p = l->l_proc;
 1009 
 1010         (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
 1011         return (0);
 1012 }
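
mlockall() passes MCL_CURRENT and/or MCL_FUTURE to uvm_map_pageable_all(); with MCL_FUTURE the map gains VM_MAP_WIREFUTURE, which is what makes uvm_mmap() below wire each new mapping as it is created. A minimal illustrative caller for a latency-sensitive process:

#include <sys/mman.h>
#include <stdio.h>

int
main(void)
{
        /* Wire everything mapped now and everything mapped later. */
        if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1) {
                perror("mlockall");
                return 1;
        }
        /* ... real-time work: no major faults on mapped memory ... */
        munlockall();
        return 0;
}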
 1013 
 1014 /*
 1015  * uvm_mmap: internal version of mmap
 1016  *
 1017  * - used by sys_mmap and various framebuffers
 1018  * - handle is a vnode pointer or NULL for MAP_ANON
 1019  * - caller must page-align the file offset
 1020  */
 1021 
 1022 int
 1023 uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
 1024         struct vm_map *map;
 1025         vaddr_t *addr;
 1026         vsize_t size;
 1027         vm_prot_t prot, maxprot;
 1028         int flags;
 1029         void *handle;
 1030         voff_t foff;
 1031         vsize_t locklimit;
 1032 {
 1033         struct uvm_object *uobj;
 1034         struct vnode *vp;
 1035         vaddr_t align = 0;
 1036         int error;
 1037         int advice = UVM_ADV_NORMAL;
 1038         uvm_flag_t uvmflag = 0;
 1039         boolean_t needwritemap;
 1040 
 1041         /*
 1042          * check params
 1043          */
 1044 
 1045         if (size == 0)
 1046                 return(0);
 1047         if (foff & PAGE_MASK)
 1048                 return(EINVAL);
 1049         if ((prot & maxprot) != prot)
 1050                 return(EINVAL);
 1051 
 1052         /*
 1053          * for non-fixed mappings, round off the suggested address.
 1054          * for fixed mappings, check alignment and zap old mappings.
 1055          */
 1056 
 1057         if ((flags & MAP_FIXED) == 0) {
 1058                 *addr = round_page(*addr);
 1059         } else {
 1060                 if (*addr & PAGE_MASK)
 1061                         return(EINVAL);
 1062                 uvmflag |= UVM_FLAG_FIXED;
 1063                 (void) uvm_unmap(map, *addr, *addr + size);
 1064         }
 1065 
 1066         /*
  1067          * Try to see if any requested alignment can even be attempted.
  1068          * Make sure we can express the alignment (asking for a >= 4GB
  1069          * alignment on an ILP32 architecture makes no sense) and that the
  1070          * alignment is at least a page-sized quantity.  If the
  1071          * request was for a fixed mapping, make sure the supplied address
  1072          * adheres to the requested alignment.
 1073          */
 1074         align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
 1075         if (align) {
 1076                 if (align >= sizeof(vaddr_t) * NBBY)
 1077                         return(EINVAL);
 1078                 align = 1L << align;
 1079                 if (align < PAGE_SIZE)
 1080                         return(EINVAL);
 1081                 if (align >= vm_map_max(map))
 1082                         return(ENOMEM);
 1083                 if (flags & MAP_FIXED) {
 1084                         if ((*addr & (align-1)) != 0)
 1085                                 return(EINVAL);
 1086                         align = 0;
 1087                 }
 1088         }
 1089 
 1090         /*
 1091          * handle anon vs. non-anon mappings.   for non-anon mappings attach
 1092          * to underlying vm object.
 1093          */
 1094 
 1095         if (flags & MAP_ANON) {
 1096                 KASSERT(handle == NULL);
 1097                 foff = UVM_UNKNOWN_OFFSET;
 1098                 uobj = NULL;
 1099                 if ((flags & MAP_SHARED) == 0)
 1100                         /* XXX: defer amap create */
 1101                         uvmflag |= UVM_FLAG_COPYONW;
 1102                 else
 1103                         /* shared: create amap now */
 1104                         uvmflag |= UVM_FLAG_OVERLAY;
 1105 
 1106         } else {
 1107                 KASSERT(handle != NULL);
 1108                 vp = (struct vnode *)handle;
 1109 
 1110                 /*
 1111                  * Don't allow mmap for EXEC if the file system
 1112                  * is mounted NOEXEC.
 1113                  */
 1114                 if ((prot & PROT_EXEC) != 0 &&
 1115                     (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0)
 1116                         return (EACCES);
 1117 
 1118                 if (vp->v_type != VCHR) {
 1119                         error = VOP_MMAP(vp, 0, curlwp->l_cred, curlwp);
 1120                         if (error) {
 1121                                 return error;
 1122                         }
 1123 
 1124                         uobj = uvn_attach((void *)vp, (flags & MAP_SHARED) ?
 1125                            maxprot : (maxprot & ~VM_PROT_WRITE));
 1126 
 1127                         /* XXX for now, attach doesn't gain a ref */
 1128                         VREF(vp);
 1129 
 1130                         /*
 1131                          * If the vnode is being mapped with PROT_EXEC,
 1132                          * then mark it as text.
 1133                          */
 1134                         if (prot & PROT_EXEC)
 1135                                 vn_markexec(vp);
 1136                 } else {
 1137                         int i = maxprot;
 1138 
 1139                         /*
 1140                          * XXX Some devices don't like to be mapped with
 1141                          * XXX PROT_EXEC or PROT_WRITE, but we don't really
 1142                          * XXX have a better way of handling this, right now
 1143                          */
 1144                         do {
 1145                                 uobj = udv_attach((void *) &vp->v_rdev,
 1146                                     (flags & MAP_SHARED) ? i :
 1147                                     (i & ~VM_PROT_WRITE), foff, size);
 1148                                 i--;
 1149                         } while ((uobj == NULL) && (i > 0));
 1150                         advice = UVM_ADV_RANDOM;
 1151                 }
 1152                 if (uobj == NULL)
 1153                         return((vp->v_type == VREG) ? ENOMEM : EINVAL);
 1154                 if ((flags & MAP_SHARED) == 0) {
 1155                         uvmflag |= UVM_FLAG_COPYONW;
 1156                 }
 1157 
 1158                 /*
 1159                  * Set vnode flags to indicate the new kinds of mapping.
 1160                  * We take the vnode lock in exclusive mode here to serialize
 1161                  * with direct I/O.
 1162                  */
 1163 
 1164                 needwritemap = (vp->v_flag & VWRITEMAP) == 0 &&
 1165                         (flags & MAP_SHARED) != 0 &&
 1166                         (maxprot & VM_PROT_WRITE) != 0;
 1167                 if ((vp->v_flag & VMAPPED) == 0 || needwritemap) {
 1168                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1169                         simple_lock(&vp->v_interlock);
 1170                         vp->v_flag |= VMAPPED;
 1171                         if (needwritemap) {
 1172                                 vp->v_flag |= VWRITEMAP;
 1173                         }
 1174                         simple_unlock(&vp->v_interlock);
 1175                         VOP_UNLOCK(vp, 0);
 1176                 }
 1177         }
 1178 
 1179         uvmflag = UVM_MAPFLAG(prot, maxprot,
 1180                         (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
 1181                         advice, uvmflag);
 1182         error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
 1183         if (error) {
 1184                 if (uobj)
 1185                         uobj->pgops->pgo_detach(uobj);
 1186                 return error;
 1187         }
 1188 
 1189         /*
 1190          * POSIX 1003.1b -- if our address space was configured
 1191          * to lock all future mappings, wire the one we just made.
 1192          *
 1193          * Also handle the MAP_WIRED flag here.
 1194          */
 1195 
 1196         if (prot == VM_PROT_NONE) {
 1197 
 1198                 /*
 1199                  * No more work to do in this case.
 1200                  */
 1201 
 1202                 return (0);
 1203         }
 1204         vm_map_lock(map);
 1205         if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
 1206                 if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
 1207                     (locklimit != 0 &&
 1208                      size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
 1209                      locklimit)) {
 1210                         vm_map_unlock(map);
 1211                         uvm_unmap(map, *addr, *addr + size);
 1212                         return ENOMEM;
 1213                 }
 1214 
 1215                 /*
 1216                  * uvm_map_pageable() always returns the map unlocked.
 1217                  */
 1218 
 1219                 error = uvm_map_pageable(map, *addr, *addr + size,
 1220                                          FALSE, UVM_LK_ENTER);
 1221                 if (error) {
 1222                         uvm_unmap(map, *addr, *addr + size);
 1223                         return error;
 1224                 }
 1225                 return (0);
 1226         }
 1227         vm_map_unlock(map);
 1228         return 0;
 1229 }
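
The alignment handling above decodes a log2 alignment from the MAP_ALIGNMENT_MASK bits of the flags and turns it into align = 1L << n before passing it to uvm_map(). On NetBSD, userland encodes that request with the MAP_ALIGNED(n) macro; the sketch below assumes that macro and is illustrative only:

#include <sys/mman.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        /*
         * Ask for a 1 MiB-aligned anonymous mapping.  MAP_ALIGNED(20)
         * stores log2(alignment) in the MAP_ALIGNMENT_MASK bits that
         * uvm_mmap() above turns back into "align = 1L << 20".
         */
        void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(20), -1, 0);

        if (p == MAP_FAILED)
                return 1;
        printf("aligned? %s\n",
            ((uintptr_t)p & ((1u << 20) - 1)) == 0 ? "yes" : "no");
        return 0;
}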
 1230 
 1231 vaddr_t
 1232 uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
 1233 {
 1234 
 1235         return VM_DEFAULT_ADDRESS(base, sz);
 1236 }



This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.