
FreeBSD/Linux Kernel Cross Reference
sys/uvm/uvm_mmap.c


    1 /*      $NetBSD: uvm_mmap.c,v 1.126.8.1 2009/04/01 00:25:23 snj Exp $   */
    2 
    3 /*
    4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
    5  * Copyright (c) 1991, 1993 The Regents of the University of California.
    6  * Copyright (c) 1988 University of Utah.
    7  *
    8  * All rights reserved.
    9  *
   10  * This code is derived from software contributed to Berkeley by
   11  * the Systems Programming Group of the University of Utah Computer
   12  * Science Department.
   13  *
   14  * Redistribution and use in source and binary forms, with or without
   15  * modification, are permitted provided that the following conditions
   16  * are met:
   17  * 1. Redistributions of source code must retain the above copyright
   18  *    notice, this list of conditions and the following disclaimer.
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  * 3. All advertising materials mentioning features or use of this software
   23  *    must display the following acknowledgement:
   24  *      This product includes software developed by the Charles D. Cranor,
   25  *      Washington University, University of California, Berkeley and
   26  *      its contributors.
   27  * 4. Neither the name of the University nor the names of its contributors
   28  *    may be used to endorse or promote products derived from this software
   29  *    without specific prior written permission.
   30  *
   31  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   32  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   33  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   34  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   35  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   36  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   37  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   38  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   39  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   40  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   41  * SUCH DAMAGE.
   42  *
   43  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
   44  *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
   45  * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
   46  */
   47 
   48 /*
   49  * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
   50  * function.
   51  */
   52 
   53 #include <sys/cdefs.h>
   54 __KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.126.8.1 2009/04/01 00:25:23 snj Exp $");
   55 
   56 #include "opt_compat_netbsd.h"
   57 #include "opt_pax.h"
   58 #include "veriexec.h"
   59 
   60 #include <sys/param.h>
   61 #include <sys/systm.h>
   62 #include <sys/file.h>
   63 #include <sys/filedesc.h>
   64 #include <sys/resourcevar.h>
   65 #include <sys/mman.h>
   66 #include <sys/mount.h>
   67 #include <sys/proc.h>
   68 #include <sys/malloc.h>
   69 #include <sys/vnode.h>
   70 #include <sys/conf.h>
   71 #include <sys/stat.h>
   72 
   73 #if NVERIEXEC > 0
   74 #include <sys/verified_exec.h>
   75 #endif /* NVERIEXEC > 0 */
   76  
   77 #ifdef PAX_MPROTECT
   78 #include <sys/pax.h>
   79 #endif /* PAX_MPROTECT */
   80 
   81 #include <miscfs/specfs/specdev.h>
   82 
   83 #include <sys/syscallargs.h>
   84 
   85 #include <uvm/uvm.h>
   86 #include <uvm/uvm_device.h>
   87 
   88 #ifndef COMPAT_ZERODEV
   89 #define COMPAT_ZERODEV(dev)     (0)
   90 #endif
   91 
   92 static int
   93 range_test(vaddr_t addr, vsize_t size, bool ismmap)
   94 {
   95         vaddr_t vm_min_address = VM_MIN_ADDRESS;
   96         vaddr_t vm_max_address = VM_MAXUSER_ADDRESS;
   97         vaddr_t eaddr = addr + size;
   98 
   99         if (addr < vm_min_address)
  100                 return EINVAL;
  101         if (eaddr > vm_max_address)
  102                 return ismmap ? EFBIG : EINVAL;
  103         if (addr > eaddr) /* no wrapping! */
  104                 return ismmap ? EOVERFLOW : EINVAL;
  105         return 0;
  106 }
  107 
  108 /*
  109  * unimplemented VM system calls:
  110  */
  111 
  112 /*
  113  * sys_sbrk: sbrk system call.
  114  */
  115 
  116 /* ARGSUSED */
  117 int
  118 sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
  119 {
  120         /* {
  121                 syscallarg(intptr_t) incr;
  122         } */
  123 
  124         return (ENOSYS);
  125 }
  126 
  127 /*
  128  * sys_sstk: sstk system call.
  129  */
  130 
  131 /* ARGSUSED */
  132 int
  133 sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
  134 {
  135         /* {
  136                 syscallarg(int) incr;
  137         } */
  138 
  139         return (ENOSYS);
  140 }
  141 
  142 /*
  143  * sys_mincore: determine if pages are in core or not.
  144  */
  145 
  146 /* ARGSUSED */
  147 int
  148 sys_mincore(struct lwp *l, const struct sys_mincore_args *uap, register_t *retval)
  149 {
  150         /* {
  151                 syscallarg(void *) addr;
  152                 syscallarg(size_t) len;
  153                 syscallarg(char *) vec;
  154         } */
  155         struct proc *p = l->l_proc;
  156         struct vm_page *pg;
  157         char *vec, pgi;
  158         struct uvm_object *uobj;
  159         struct vm_amap *amap;
  160         struct vm_anon *anon;
  161         struct vm_map_entry *entry;
  162         vaddr_t start, end, lim;
  163         struct vm_map *map;
  164         vsize_t len;
  165         int error = 0, npgs;
  166 
  167         map = &p->p_vmspace->vm_map;
  168 
  169         start = (vaddr_t)SCARG(uap, addr);
  170         len = SCARG(uap, len);
  171         vec = SCARG(uap, vec);
  172 
  173         if (start & PAGE_MASK)
  174                 return (EINVAL);
  175         len = round_page(len);
  176         end = start + len;
  177         if (end <= start)
  178                 return (EINVAL);
  179 
  180         /*
   181          * Lock down vec, so that storing the status byte for a page
   182          * cannot page fault and make the status already returned stale.
  183          */
  184 
  185         npgs = len >> PAGE_SHIFT;
  186         error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
  187         if (error) {
  188                 return error;
  189         }
  190         vm_map_lock_read(map);
  191 
  192         if (uvm_map_lookup_entry(map, start, &entry) == false) {
  193                 error = ENOMEM;
  194                 goto out;
  195         }
  196 
  197         for (/* nothing */;
  198              entry != &map->header && entry->start < end;
  199              entry = entry->next) {
  200                 KASSERT(!UVM_ET_ISSUBMAP(entry));
  201                 KASSERT(start >= entry->start);
  202 
  203                 /* Make sure there are no holes. */
  204                 if (entry->end < end &&
  205                      (entry->next == &map->header ||
  206                       entry->next->start > entry->end)) {
  207                         error = ENOMEM;
  208                         goto out;
  209                 }
  210 
  211                 lim = end < entry->end ? end : entry->end;
  212 
  213                 /*
  214                  * Special case for objects with no "real" pages.  Those
  215                  * are always considered resident (mapped devices).
  216                  */
  217 
  218                 if (UVM_ET_ISOBJ(entry)) {
  219                         KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
  220                         if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
  221                                 for (/* nothing */; start < lim;
  222                                      start += PAGE_SIZE, vec++)
  223                                         subyte(vec, 1);
  224                                 continue;
  225                         }
  226                 }
  227 
  228                 amap = entry->aref.ar_amap;     /* top layer */
  229                 uobj = entry->object.uvm_obj;   /* bottom layer */
  230 
  231                 if (amap != NULL)
  232                         amap_lock(amap);
  233                 if (uobj != NULL)
  234                         mutex_enter(&uobj->vmobjlock);
  235 
  236                 for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
  237                         pgi = 0;
  238                         if (amap != NULL) {
  239                                 /* Check the top layer first. */
  240                                 anon = amap_lookup(&entry->aref,
  241                                     start - entry->start);
  242                                 /* Don't need to lock anon here. */
  243                                 if (anon != NULL && anon->an_page != NULL) {
  244 
  245                                         /*
  246                                          * Anon has the page for this entry
  247                                          * offset.
  248                                          */
  249 
  250                                         pgi = 1;
  251                                 }
  252                         }
  253                         if (uobj != NULL && pgi == 0) {
  254                                 /* Check the bottom layer. */
  255                                 pg = uvm_pagelookup(uobj,
  256                                     entry->offset + (start - entry->start));
  257                                 if (pg != NULL) {
  258 
  259                                         /*
  260                                          * Object has the page for this entry
  261                                          * offset.
  262                                          */
  263 
  264                                         pgi = 1;
  265                                 }
  266                         }
  267                         (void) subyte(vec, pgi);
  268                 }
  269                 if (uobj != NULL)
  270                         mutex_exit(&uobj->vmobjlock);
  271                 if (amap != NULL)
  272                         amap_unlock(amap);
  273         }
  274 
  275  out:
  276         vm_map_unlock_read(map);
  277         uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
  278         return (error);
  279 }
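
A usage sketch from the user side (not part of this kernel source): mincore(2) fills one status byte per page of the queried range, which is what the subyte() stores above produce. A minimal, hypothetical example using the NetBSD prototype (char *vec):

/* Hypothetical userland sketch: query page residency with mincore(2). */
#include <sys/mman.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        long pagesz = sysconf(_SC_PAGESIZE);
        size_t len = 8 * (size_t)pagesz;
        char vec[8];
        char *buf;

        /* anonymous mapping; pages are not resident until first touched */
        buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (buf == MAP_FAILED)
                err(1, "mmap");

        buf[0] = 1;                             /* fault in the first page only */

        if (mincore(buf, len, vec) == -1)       /* one status byte per page */
                err(1, "mincore");
        for (int i = 0; i < 8; i++)
                printf("page %d: %s\n", i, (vec[i] & 1) ? "in core" : "not in core");

        munmap(buf, len);
        return 0;
}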
  280 
  281 /*
  282  * sys_mmap: mmap system call.
  283  *
  284  * => file offset and address may not be page aligned
   285  *    - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
  286  *    - if address isn't page aligned the mapping starts at trunc_page(addr)
  287  *      and the return value is adjusted up by the page offset.
  288  */
  289 
  290 int
  291 sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
  292 {
  293         /* {
  294                 syscallarg(void *) addr;
  295                 syscallarg(size_t) len;
  296                 syscallarg(int) prot;
  297                 syscallarg(int) flags;
  298                 syscallarg(int) fd;
  299                 syscallarg(long) pad;
  300                 syscallarg(off_t) pos;
  301         } */
  302         struct proc *p = l->l_proc;
  303         vaddr_t addr;
  304         struct vattr va;
  305         off_t pos;
  306         vsize_t size, pageoff;
  307         vm_prot_t prot, maxprot;
  308         int flags, fd;
  309         vaddr_t defaddr;
  310         struct file *fp = NULL;
  311         struct vnode *vp;
  312         void *handle;
  313         int error;
  314 #ifdef PAX_ASLR
  315         vaddr_t orig_addr;
  316 #endif /* PAX_ASLR */
  317 
  318         /*
  319          * first, extract syscall args from the uap.
  320          */
  321 
  322         addr = (vaddr_t)SCARG(uap, addr);
  323         size = (vsize_t)SCARG(uap, len);
  324         prot = SCARG(uap, prot) & VM_PROT_ALL;
  325         flags = SCARG(uap, flags);
  326         fd = SCARG(uap, fd);
  327         pos = SCARG(uap, pos);
  328 
  329 #ifdef PAX_ASLR
  330         orig_addr = addr;
  331 #endif /* PAX_ASLR */
  332 
  333         /*
  334          * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
  335          * validate the flags.
  336          */
  337         if (flags & MAP_COPY)
  338                 flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
  339         if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
  340                 return (EINVAL);
  341 
  342         /*
  343          * align file position and save offset.  adjust size.
  344          */
  345 
  346         pageoff = (pos & PAGE_MASK);
  347         pos  -= pageoff;
  348         size += pageoff;                        /* add offset */
  349         size = (vsize_t)round_page(size);       /* round up */
  350 
  351         /*
  352          * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
  353          */
  354         if (flags & MAP_FIXED) {
  355 
  356                 /* ensure address and file offset are aligned properly */
  357                 addr -= pageoff;
  358                 if (addr & PAGE_MASK)
  359                         return (EINVAL);
  360 
  361                 error = range_test(addr, size, true);
  362                 if (error)
  363                         return error;
  364         } else if (addr == 0 || !(flags & MAP_TRYFIXED)) {
  365 
  366                 /*
  367                  * not fixed: make sure we skip over the largest
  368                  * possible heap for non-topdown mapping arrangements.
  369                  * we will refine our guess later (e.g. to account for
  370                  * VAC, etc)
  371                  */
  372 
  373                 defaddr = p->p_emul->e_vm_default_addr(p,
  374                     (vaddr_t)p->p_vmspace->vm_daddr, size);
  375 
  376                 if (addr == 0 ||
  377                     !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
  378                         addr = MAX(addr, defaddr);
  379                 else
  380                         addr = MIN(addr, defaddr);
  381         }
  382 
  383         /*
  384          * check for file mappings (i.e. not anonymous) and verify file.
  385          */
  386 
  387         if ((flags & MAP_ANON) == 0) {
  388                 if ((fp = fd_getfile(fd)) == NULL)
  389                         return (EBADF);
  390                 if (fp->f_type != DTYPE_VNODE) {
  391                         fd_putfile(fd);
  392                         return (ENODEV);                /* only mmap vnodes! */
  393                 }
  394                 vp = fp->f_data;                /* convert to vnode */
  395                 if (vp->v_type != VREG && vp->v_type != VCHR &&
  396                     vp->v_type != VBLK) {
  397                         fd_putfile(fd);
  398                         return (ENODEV);  /* only REG/CHR/BLK support mmap */
  399                 }
  400                 if (vp->v_type != VCHR && pos < 0) {
  401                         fd_putfile(fd);
  402                         return (EINVAL);
  403                 }
  404                 if (vp->v_type != VCHR && (pos + size) < pos) {
  405                         fd_putfile(fd);
  406                         return (EOVERFLOW);             /* no offset wrapping */
  407                 }
  408 
  409                 /* special case: catch SunOS style /dev/zero */
  410                 if (vp->v_type == VCHR
  411                     && (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
  412                         flags |= MAP_ANON;
  413                         fd_putfile(fd);
  414                         fp = NULL;
  415                         goto is_anon;
  416                 }
  417 
  418                 /*
  419                  * Old programs may not select a specific sharing type, so
  420                  * default to an appropriate one.
  421                  *
  422                  * XXX: how does MAP_ANON fit in the picture?
  423                  */
  424                 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
  425 #if defined(DEBUG)
  426                         printf("WARNING: defaulted mmap() share type to "
  427                            "%s (pid %d command %s)\n", vp->v_type == VCHR ?
  428                            "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
  429                             p->p_comm);
  430 #endif
  431                         if (vp->v_type == VCHR)
  432                                 flags |= MAP_SHARED;    /* for a device */
  433                         else
  434                                 flags |= MAP_PRIVATE;   /* for a file */
  435                 }
  436 
  437                 /*
  438                  * MAP_PRIVATE device mappings don't make sense (and aren't
  439                  * supported anyway).  However, some programs rely on this,
  440                  * so just change it to MAP_SHARED.
  441                  */
  442                 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
  443                         flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
  444                 }
  445 
  446                 /*
  447                  * now check protection
  448                  */
  449 
  450                 maxprot = VM_PROT_EXECUTE;
  451 
  452                 /* check read access */
  453                 if (fp->f_flag & FREAD)
  454                         maxprot |= VM_PROT_READ;
  455                 else if (prot & PROT_READ) {
  456                         fd_putfile(fd);
  457                         return (EACCES);
  458                 }
  459 
  460                 /* check write access, shared case first */
  461                 if (flags & MAP_SHARED) {
  462                         /*
  463                          * if the file is writable, only add PROT_WRITE to
   464                          * maxprot if the file is not immutable or append-only.
  465                          * otherwise, if we have asked for PROT_WRITE, return
  466                          * EPERM.
  467                          */
  468                         if (fp->f_flag & FWRITE) {
  469                                 if ((error =
  470                                     VOP_GETATTR(vp, &va, l->l_cred))) {
  471                                         fd_putfile(fd);
  472                                         return (error);
  473                                 }
  474                                 if ((va.va_flags &
  475                                     (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
  476                                         maxprot |= VM_PROT_WRITE;
  477                                 else if (prot & PROT_WRITE) {
  478                                         fd_putfile(fd);
  479                                         return (EPERM);
  480                                 }
  481                         }
  482                         else if (prot & PROT_WRITE) {
  483                                 fd_putfile(fd);
  484                                 return (EACCES);
  485                         }
  486                 } else {
   487                         /* MAP_PRIVATE mappings can always be written to (copy-on-write) */
  488                         maxprot |= VM_PROT_WRITE;
  489                 }
  490                 handle = vp;
  491 
  492         } else {                /* MAP_ANON case */
  493                 /*
  494                  * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
  495                  */
  496                 if (fd != -1)
  497                         return (EINVAL);
  498 
  499  is_anon:               /* label for SunOS style /dev/zero */
  500                 handle = NULL;
  501                 maxprot = VM_PROT_ALL;
  502                 pos = 0;
  503         }
  504 
  505 #if NVERIEXEC > 0
  506         if (handle != NULL) {
  507                 /*
  508                  * Check if the file can be executed indirectly.
  509                  *
  510                  * XXX: This gives false warnings about "Incorrect access type"
  511                  * XXX: if the mapping is not executable. Harmless, but will be
  512                  * XXX: fixed as part of other changes.
  513                  */
  514                 if (veriexec_verify(l, handle, "(mmap)", VERIEXEC_INDIRECT,
  515                     NULL)) {
  516                         /*
  517                          * Don't allow executable mappings if we can't
  518                          * indirectly execute the file.
  519                          */
  520                         if (prot & VM_PROT_EXECUTE) {
  521                                 if (fp != NULL)
  522                                         fd_putfile(fd);
  523                                 return (EPERM);
  524                         }
  525 
  526                         /*
  527                          * Strip the executable bit from 'maxprot' to make sure
  528                          * it can't be made executable later.
  529                          */
  530                         maxprot &= ~VM_PROT_EXECUTE;
  531                 }
  532         }
  533 #endif /* NVERIEXEC > 0 */
  534 
  535 #ifdef PAX_MPROTECT
  536         pax_mprotect(l, &prot, &maxprot);
  537 #endif /* PAX_MPROTECT */
  538 
  539 #ifdef PAX_ASLR
  540         pax_aslr(l, &addr, orig_addr, flags);
  541 #endif /* PAX_ASLR */
  542 
  543         /*
  544          * now let kernel internal function uvm_mmap do the work.
  545          */
  546 
  547         error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
  548             flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
  549 
  550         if (error == 0)
  551                 /* remember to add offset */
  552                 *retval = (register_t)(addr + pageoff);
  553 
  554         if (fp != NULL)
  555                 fd_putfile(fd);
  556 
  557         return (error);
  558 }
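
The header comment above notes that sys_mmap() tolerates a hint address and file offset that are not page aligned for non-fixed mappings, mapping from trunc_page() and adjusting the returned address. A minimal, hypothetical userland sketch of a private file mapping (the path is only an example):

/* Hypothetical sketch: map a file read-only with mmap(2). */
#include <sys/mman.h>
#include <sys/stat.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        struct stat st;
        char *p;
        int fd;

        fd = open("/etc/services", O_RDONLY);   /* any readable, non-empty file */
        if (fd == -1)
                err(1, "open");
        if (fstat(fd, &st) == -1)
                err(1, "fstat");

        /* MAP_PRIVATE file mapping starting at (page aligned) offset 0 */
        p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        printf("mapped %lld bytes, first byte 0x%02x\n",
            (long long)st.st_size, (unsigned char)p[0]);

        munmap(p, (size_t)st.st_size);
        close(fd);
        return 0;
}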
  559 
  560 /*
  561  * sys___msync13: the msync system call (a front-end for flush)
  562  */
  563 
  564 int
  565 sys___msync13(struct lwp *l, const struct sys___msync13_args *uap, register_t *retval)
  566 {
  567         /* {
  568                 syscallarg(void *) addr;
  569                 syscallarg(size_t) len;
  570                 syscallarg(int) flags;
  571         } */
  572         struct proc *p = l->l_proc;
  573         vaddr_t addr;
  574         vsize_t size, pageoff;
  575         struct vm_map *map;
  576         int error, rv, flags, uvmflags;
  577 
  578         /*
  579          * extract syscall args from the uap
  580          */
  581 
  582         addr = (vaddr_t)SCARG(uap, addr);
  583         size = (vsize_t)SCARG(uap, len);
  584         flags = SCARG(uap, flags);
  585 
  586         /* sanity check flags */
  587         if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
  588             (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
  589             (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
  590                 return (EINVAL);
  591         if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
  592                 flags |= MS_SYNC;
  593 
  594         /*
  595          * align the address to a page boundary and adjust the size accordingly.
  596          */
  597 
  598         pageoff = (addr & PAGE_MASK);
  599         addr -= pageoff;
  600         size += pageoff;
  601         size = (vsize_t)round_page(size);
  602 
  603         error = range_test(addr, size, false);
  604         if (error)
  605                 return error;
  606 
  607         /*
  608          * get map
  609          */
  610 
  611         map = &p->p_vmspace->vm_map;
  612 
  613         /*
  614          * XXXCDC: do we really need this semantic?
  615          *
  616          * XXX Gak!  If size is zero we are supposed to sync "all modified
   617          * pages within the region containing addr".  Unfortunately, we
  618          * don't really keep track of individual mmaps so we approximate
  619          * by flushing the range of the map entry containing addr.
  620          * This can be incorrect if the region splits or is coalesced
  621          * with a neighbor.
  622          */
  623 
  624         if (size == 0) {
  625                 struct vm_map_entry *entry;
  626 
  627                 vm_map_lock_read(map);
  628                 rv = uvm_map_lookup_entry(map, addr, &entry);
  629                 if (rv == true) {
  630                         addr = entry->start;
  631                         size = entry->end - entry->start;
  632                 }
  633                 vm_map_unlock_read(map);
  634                 if (rv == false)
  635                         return (EINVAL);
  636         }
  637 
  638         /*
  639          * translate MS_ flags into PGO_ flags
  640          */
  641 
  642         uvmflags = PGO_CLEANIT;
  643         if (flags & MS_INVALIDATE)
  644                 uvmflags |= PGO_FREE;
  645         if (flags & MS_SYNC)
  646                 uvmflags |= PGO_SYNCIO;
  647 
  648         error = uvm_map_clean(map, addr, addr+size, uvmflags);
  649         return error;
  650 }
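
A hypothetical userland sketch of the flag handling above: MS_SYNC waits for the writeback (PGO_SYNCIO), MS_ASYNC queues it, and MS_INVALIDATE additionally frees the cached pages (PGO_FREE). The temporary file path is only an example:

/* Hypothetical sketch: flush a shared file mapping with msync(2). */
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        const size_t len = 4096;
        char *p;
        int fd;

        fd = open("/tmp/msync-demo", O_RDWR | O_CREAT, 0600);
        if (fd == -1)
                err(1, "open");
        if (ftruncate(fd, (off_t)len) == -1)
                err(1, "ftruncate");

        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        memcpy(p, "hello", 5);                  /* dirty the first page */

        /* MS_SYNC: return only after the dirty pages have been written back */
        if (msync(p, len, MS_SYNC) == -1)
                err(1, "msync");

        munmap(p, len);
        close(fd);
        return 0;
}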
  651 
  652 /*
   653  * sys_munmap: unmap a user's memory
  654  */
  655 
  656 int
  657 sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
  658 {
  659         /* {
  660                 syscallarg(void *) addr;
  661                 syscallarg(size_t) len;
  662         } */
  663         struct proc *p = l->l_proc;
  664         vaddr_t addr;
  665         vsize_t size, pageoff;
  666         struct vm_map *map;
  667         struct vm_map_entry *dead_entries;
  668         int error;
  669 
  670         /*
  671          * get syscall args.
  672          */
  673 
  674         addr = (vaddr_t)SCARG(uap, addr);
  675         size = (vsize_t)SCARG(uap, len);
  676 
  677         /*
  678          * align the address to a page boundary and adjust the size accordingly.
  679          */
  680 
  681         pageoff = (addr & PAGE_MASK);
  682         addr -= pageoff;
  683         size += pageoff;
  684         size = (vsize_t)round_page(size);
  685 
  686         if (size == 0)
  687                 return (0);
  688 
  689         error = range_test(addr, size, false);
  690         if (error)
  691                 return error;
  692 
  693         map = &p->p_vmspace->vm_map;
  694 
  695         /*
  696          * interesting system call semantic: make sure entire range is
  697          * allocated before allowing an unmap.
  698          */
  699 
  700         vm_map_lock(map);
  701 #if 0
  702         if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
  703                 vm_map_unlock(map);
  704                 return (EINVAL);
  705         }
  706 #endif
  707         uvm_unmap_remove(map, addr, addr + size, &dead_entries, NULL, 0);
  708         vm_map_unlock(map);
  709         if (dead_entries != NULL)
  710                 uvm_unmap_detach(dead_entries, 0);
  711         return (0);
  712 }
  713 
  714 /*
  715  * sys_mprotect: the mprotect system call
  716  */
  717 
  718 int
  719 sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap, register_t *retval)
  720 {
  721         /* {
  722                 syscallarg(void *) addr;
  723                 syscallarg(size_t) len;
  724                 syscallarg(int) prot;
  725         } */
  726         struct proc *p = l->l_proc;
  727         vaddr_t addr;
  728         vsize_t size, pageoff;
  729         vm_prot_t prot;
  730         int error;
  731 
  732         /*
  733          * extract syscall args from uap
  734          */
  735 
  736         addr = (vaddr_t)SCARG(uap, addr);
  737         size = (vsize_t)SCARG(uap, len);
  738         prot = SCARG(uap, prot) & VM_PROT_ALL;
  739 
  740         /*
  741          * align the address to a page boundary and adjust the size accordingly.
  742          */
  743 
  744         pageoff = (addr & PAGE_MASK);
  745         addr -= pageoff;
  746         size += pageoff;
  747         size = round_page(size);
  748 
  749         error = range_test(addr, size, false);
  750         if (error)
  751                 return error;
  752 
  753         error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
  754                                 false);
  755         return error;
  756 }
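
A hypothetical userland sketch of the call handled above, dropping write permission from an anonymous page:

/* Hypothetical sketch: make a writable page read-only with mprotect(2). */
#include <sys/mman.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        size_t len = (size_t)sysconf(_SC_PAGESIZE);
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        strcpy(p, "read-write for now");

        /* make the page read-only; a later store would deliver SIGSEGV */
        if (mprotect(p, len, PROT_READ) == -1)
                err(1, "mprotect");

        munmap(p, len);
        return 0;
}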
  757 
  758 /*
  759  * sys_minherit: the minherit system call
  760  */
  761 
  762 int
  763 sys_minherit(struct lwp *l, const struct sys_minherit_args *uap, register_t *retval)
  764 {
  765         /* {
  766                 syscallarg(void *) addr;
  767                 syscallarg(int) len;
  768                 syscallarg(int) inherit;
  769         } */
  770         struct proc *p = l->l_proc;
  771         vaddr_t addr;
  772         vsize_t size, pageoff;
  773         vm_inherit_t inherit;
  774         int error;
  775 
  776         addr = (vaddr_t)SCARG(uap, addr);
  777         size = (vsize_t)SCARG(uap, len);
  778         inherit = SCARG(uap, inherit);
  779 
  780         /*
  781          * align the address to a page boundary and adjust the size accordingly.
  782          */
  783 
  784         pageoff = (addr & PAGE_MASK);
  785         addr -= pageoff;
  786         size += pageoff;
  787         size = (vsize_t)round_page(size);
  788 
  789         error = range_test(addr, size, false);
  790         if (error)
  791                 return error;
  792 
  793         error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
  794                                 inherit);
  795         return error;
  796 }
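
A hypothetical sketch of minherit(2): the inheritance value controls what a child created by fork(2) sees for the range (the MAP_INHERIT_* constants are those from NetBSD's <sys/mman.h>):

/* Hypothetical sketch: hide a buffer from future children with minherit(2). */
#include <sys/mman.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        size_t len = (size_t)sysconf(_SC_PAGESIZE);
        char *secret;

        secret = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (secret == MAP_FAILED)
                err(1, "mmap");
        strcpy(secret, "do not copy into children");

        /* children created after this call get no mapping for this range */
        if (minherit(secret, len, MAP_INHERIT_NONE) == -1)
                err(1, "minherit");

        /* ... fork(2) here; the child would fault if it touched 'secret' ... */

        munmap(secret, len);
        return 0;
}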
  797 
  798 /*
  799  * sys_madvise: give advice about memory usage.
  800  */
  801 
  802 /* ARGSUSED */
  803 int
  804 sys_madvise(struct lwp *l, const struct sys_madvise_args *uap, register_t *retval)
  805 {
  806         /* {
  807                 syscallarg(void *) addr;
  808                 syscallarg(size_t) len;
  809                 syscallarg(int) behav;
  810         } */
  811         struct proc *p = l->l_proc;
  812         vaddr_t addr;
  813         vsize_t size, pageoff;
  814         int advice, error;
  815 
  816         addr = (vaddr_t)SCARG(uap, addr);
  817         size = (vsize_t)SCARG(uap, len);
  818         advice = SCARG(uap, behav);
  819 
  820         /*
  821          * align the address to a page boundary, and adjust the size accordingly
  822          */
  823 
  824         pageoff = (addr & PAGE_MASK);
  825         addr -= pageoff;
  826         size += pageoff;
  827         size = (vsize_t)round_page(size);
  828 
  829         error = range_test(addr, size, false);
  830         if (error)
  831                 return error;
  832 
  833         switch (advice) {
  834         case MADV_NORMAL:
  835         case MADV_RANDOM:
  836         case MADV_SEQUENTIAL:
  837                 error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
  838                     advice);
  839                 break;
  840 
  841         case MADV_WILLNEED:
  842 
  843                 /*
  844                  * Activate all these pages, pre-faulting them in if
  845                  * necessary.
  846                  */
  847                 /*
  848                  * XXX IMPLEMENT ME.
  849                  * Should invent a "weak" mode for uvm_fault()
  850                  * which would only do the PGO_LOCKED pgo_get().
  851                  */
  852 
  853                 return (0);
  854 
  855         case MADV_DONTNEED:
  856 
  857                 /*
  858                  * Deactivate all these pages.  We don't need them
  859                  * any more.  We don't, however, toss the data in
  860                  * the pages.
  861                  */
  862 
  863                 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
  864                     PGO_DEACTIVATE);
  865                 break;
  866 
  867         case MADV_FREE:
  868 
  869                 /*
  870                  * These pages contain no valid data, and may be
  871                  * garbage-collected.  Toss all resources, including
  872                  * any swap space in use.
  873                  */
  874 
  875                 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
  876                     PGO_FREE);
  877                 break;
  878 
  879         case MADV_SPACEAVAIL:
  880 
  881                 /*
  882                  * XXXMRG What is this?  I think it's:
  883                  *
  884                  *      Ensure that we have allocated backing-store
  885                  *      for these pages.
  886                  *
  887                  * This is going to require changes to the page daemon,
  888                  * as it will free swap space allocated to pages in core.
  889                  * There's also what to do for device/file/anonymous memory.
  890                  */
  891 
  892                 return (EINVAL);
  893 
  894         default:
  895                 return (EINVAL);
  896         }
  897 
  898         return error;
  899 }
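
A hypothetical sketch of the advice values handled above: MADV_NORMAL/RANDOM/SEQUENTIAL only adjust the map entry's advice, while MADV_DONTNEED deactivates the pages (PGO_DEACTIVATE) without discarding their contents:

/* Hypothetical sketch: give paging hints with madvise(2). */
#include <sys/mman.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
        size_t len = 64 * (size_t)sysconf(_SC_PAGESIZE);
        char *p;

        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        /* we intend to walk the region from start to end exactly once */
        if (madvise(p, len, MADV_SEQUENTIAL) == -1)
                err(1, "madvise");

        for (size_t i = 0; i < len; i++)
                p[i] = 1;

        /* done for now; pages may be deactivated (contents are kept) */
        if (madvise(p, len, MADV_DONTNEED) == -1)
                err(1, "madvise");

        munmap(p, len);
        return 0;
}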
  900 
  901 /*
  902  * sys_mlock: memory lock
  903  */
  904 
  905 int
  906 sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
  907 {
  908         /* {
  909                 syscallarg(const void *) addr;
  910                 syscallarg(size_t) len;
  911         } */
  912         struct proc *p = l->l_proc;
  913         vaddr_t addr;
  914         vsize_t size, pageoff;
  915         int error;
  916 
  917         /*
  918          * extract syscall args from uap
  919          */
  920 
  921         addr = (vaddr_t)SCARG(uap, addr);
  922         size = (vsize_t)SCARG(uap, len);
  923 
  924         /*
  925          * align the address to a page boundary and adjust the size accordingly
  926          */
  927 
  928         pageoff = (addr & PAGE_MASK);
  929         addr -= pageoff;
  930         size += pageoff;
  931         size = (vsize_t)round_page(size);
  932 
  933         error = range_test(addr, size, false);
  934         if (error)
  935                 return error;
  936 
  937         if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
  938                 return (EAGAIN);
  939 
  940         if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
  941                         p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
  942                 return (EAGAIN);
  943 
  944         error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
  945             0);
  946         if (error == EFAULT)
  947                 error = ENOMEM;
  948         return error;
  949 }
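
A hypothetical sketch of mlock(2)/munlock(2); as checked above, the wired total is bounded both by the global uvmexp.wiredmax and by the per-process RLIMIT_MEMLOCK limit:

/* Hypothetical sketch: wire a page of sensitive data with mlock(2). */
#include <sys/mman.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
        size_t len = (size_t)sysconf(_SC_PAGESIZE);
        char *key;

        key = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE, -1, 0);
        if (key == MAP_FAILED)
                err(1, "mmap");

        /* wire the page so it cannot be paged out to swap */
        if (mlock(key, len) == -1)
                err(1, "mlock");                /* EAGAIN if a limit is hit */

        /* ... fill and use the key material ... */

        memset(key, 0, len);                    /* scrub before unwiring */
        if (munlock(key, len) == -1)
                err(1, "munlock");
        munmap(key, len);
        return 0;
}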
  950 
  951 /*
  952  * sys_munlock: unlock wired pages
  953  */
  954 
  955 int
  956 sys_munlock(struct lwp *l, const struct sys_munlock_args *uap, register_t *retval)
  957 {
  958         /* {
  959                 syscallarg(const void *) addr;
  960                 syscallarg(size_t) len;
  961         } */
  962         struct proc *p = l->l_proc;
  963         vaddr_t addr;
  964         vsize_t size, pageoff;
  965         int error;
  966 
  967         /*
  968          * extract syscall args from uap
  969          */
  970 
  971         addr = (vaddr_t)SCARG(uap, addr);
  972         size = (vsize_t)SCARG(uap, len);
  973 
  974         /*
  975          * align the address to a page boundary, and adjust the size accordingly
  976          */
  977 
  978         pageoff = (addr & PAGE_MASK);
  979         addr -= pageoff;
  980         size += pageoff;
  981         size = (vsize_t)round_page(size);
  982 
  983         error = range_test(addr, size, false);
  984         if (error)
  985                 return error;
  986 
  987         error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
  988             0);
  989         if (error == EFAULT)
  990                 error = ENOMEM;
  991         return error;
  992 }
  993 
  994 /*
  995  * sys_mlockall: lock all pages mapped into an address space.
  996  */
  997 
  998 int
  999 sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap, register_t *retval)
 1000 {
 1001         /* {
 1002                 syscallarg(int) flags;
 1003         } */
 1004         struct proc *p = l->l_proc;
 1005         int error, flags;
 1006 
 1007         flags = SCARG(uap, flags);
 1008 
 1009         if (flags == 0 ||
 1010             (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
 1011                 return (EINVAL);
 1012 
 1013         error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
 1014             p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
 1015         return (error);
 1016 }
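
A hypothetical sketch of mlockall(2); MCL_FUTURE is what marks the map VM_MAP_WIREFUTURE, which uvm_mmap() below honors by wiring each new mapping as it is created:

/* Hypothetical sketch: wire a whole address space with mlockall(2). */
#include <sys/mman.h>
#include <err.h>

int
main(void)
{
        /*
         * Wire everything mapped now and everything mapped later; this
         * fails if it would exceed the RLIMIT_MEMLOCK resource limit.
         */
        if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
                err(1, "mlockall");

        /* ... latency-sensitive work that must not take page faults ... */

        munlockall();
        return 0;
}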
 1017 
 1018 /*
 1019  * sys_munlockall: unlock all pages mapped into an address space.
 1020  */
 1021 
 1022 int
 1023 sys_munlockall(struct lwp *l, const void *v, register_t *retval)
 1024 {
 1025         struct proc *p = l->l_proc;
 1026 
 1027         (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
 1028         return (0);
 1029 }
 1030 
 1031 /*
 1032  * uvm_mmap: internal version of mmap
 1033  *
 1034  * - used by sys_mmap and various framebuffers
 1035  * - handle is a vnode pointer or NULL for MAP_ANON
 1036  * - caller must page-align the file offset
 1037  */
 1038 
 1039 int
 1040 uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit)
 1041         struct vm_map *map;
 1042         vaddr_t *addr;
 1043         vsize_t size;
 1044         vm_prot_t prot, maxprot;
 1045         int flags;
 1046         void *handle;
 1047         voff_t foff;
 1048         vsize_t locklimit;
 1049 {
 1050         struct uvm_object *uobj;
 1051         struct vnode *vp;
 1052         vaddr_t align = 0;
 1053         int error;
 1054         int advice = UVM_ADV_NORMAL;
 1055         uvm_flag_t uvmflag = 0;
 1056         bool needwritemap;
 1057 
 1058         /*
 1059          * check params
 1060          */
 1061 
 1062         if (size == 0)
 1063                 return(0);
 1064         if (foff & PAGE_MASK)
 1065                 return(EINVAL);
 1066         if ((prot & maxprot) != prot)
 1067                 return(EINVAL);
 1068 
 1069         /*
 1070          * for non-fixed mappings, round off the suggested address.
 1071          * for fixed mappings, check alignment and zap old mappings.
 1072          */
 1073 
 1074         if ((flags & MAP_FIXED) == 0) {
 1075                 *addr = round_page(*addr);
 1076         } else {
 1077                 if (*addr & PAGE_MASK)
 1078                         return(EINVAL);
 1079                 uvmflag |= UVM_FLAG_FIXED;
 1080                 (void) uvm_unmap(map, *addr, *addr + size);
 1081         }
 1082 
 1083         /*
  1084          * Try to see if any requested alignment can even be attempted.
  1085          * Make sure we can express the alignment (asking for a >= 4GB
  1086          * alignment on an ILP32 architecture makes no sense) and that the
  1087          * alignment is at least a page-sized quantity.  If the request
  1088          * was for a fixed mapping, make sure the supplied address
  1089          * adheres to the requested alignment.
 1090          */
 1091         align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
 1092         if (align) {
 1093                 if (align >= sizeof(vaddr_t) * NBBY)
 1094                         return(EINVAL);
 1095                 align = 1L << align;
 1096                 if (align < PAGE_SIZE)
 1097                         return(EINVAL);
 1098                 if (align >= vm_map_max(map))
 1099                         return(ENOMEM);
 1100                 if (flags & MAP_FIXED) {
 1101                         if ((*addr & (align-1)) != 0)
 1102                                 return(EINVAL);
 1103                         align = 0;
 1104                 }
 1105         }
 1106 
 1107         /*
 1108          * check resource limits
 1109          */
 1110 
 1111         if (!VM_MAP_IS_KERNEL(map) &&
 1112             (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
 1113             curproc->p_rlimit[RLIMIT_AS].rlim_cur))
 1114                 return ENOMEM;
 1115 
 1116         /*
 1117          * handle anon vs. non-anon mappings.   for non-anon mappings attach
 1118          * to underlying vm object.
 1119          */
 1120 
 1121         if (flags & MAP_ANON) {
 1122                 KASSERT(handle == NULL);
 1123                 foff = UVM_UNKNOWN_OFFSET;
 1124                 uobj = NULL;
 1125                 if ((flags & MAP_SHARED) == 0)
 1126                         /* XXX: defer amap create */
 1127                         uvmflag |= UVM_FLAG_COPYONW;
 1128                 else
 1129                         /* shared: create amap now */
 1130                         uvmflag |= UVM_FLAG_OVERLAY;
 1131 
 1132         } else {
 1133                 KASSERT(handle != NULL);
 1134                 vp = (struct vnode *)handle;
 1135 
 1136                 /*
 1137                  * Don't allow mmap for EXEC if the file system
 1138                  * is mounted NOEXEC.
 1139                  */
 1140                 if ((prot & PROT_EXEC) != 0 &&
 1141                     (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0)
 1142                         return (EACCES);
 1143 
 1144                 if (vp->v_type != VCHR) {
 1145                         error = VOP_MMAP(vp, prot, curlwp->l_cred);
 1146                         if (error) {
 1147                                 return error;
 1148                         }
 1149                         vref(vp);
 1150                         uobj = &vp->v_uobj;
 1151 
 1152                         /*
 1153                          * If the vnode is being mapped with PROT_EXEC,
 1154                          * then mark it as text.
 1155                          */
 1156                         if (prot & PROT_EXEC) {
 1157                                 vn_markexec(vp);
 1158                         }
 1159                 } else {
 1160                         int i = maxprot;
 1161 
 1162                         /*
 1163                          * XXX Some devices don't like to be mapped with
 1164                          * XXX PROT_EXEC or PROT_WRITE, but we don't really
 1165                          * XXX have a better way of handling this, right now
 1166                          */
 1167                         do {
 1168                                 uobj = udv_attach((void *) &vp->v_rdev,
 1169                                     (flags & MAP_SHARED) ? i :
 1170                                     (i & ~VM_PROT_WRITE), foff, size);
 1171                                 i--;
 1172                         } while ((uobj == NULL) && (i > 0));
 1173                         advice = UVM_ADV_RANDOM;
 1174                 }
 1175                 if (uobj == NULL)
 1176                         return((vp->v_type == VREG) ? ENOMEM : EINVAL);
 1177                 if ((flags & MAP_SHARED) == 0) {
 1178                         uvmflag |= UVM_FLAG_COPYONW;
 1179                 }
 1180 
 1181                 /*
 1182                  * Set vnode flags to indicate the new kinds of mapping.
 1183                  * We take the vnode lock in exclusive mode here to serialize
 1184                  * with direct I/O.
 1185                  *
 1186                  * Safe to check for these flag values without a lock, as
 1187                  * long as a reference to the vnode is held.
 1188                  */
 1189                 needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
 1190                         (flags & MAP_SHARED) != 0 &&
 1191                         (maxprot & VM_PROT_WRITE) != 0;
 1192                 if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
 1193                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1194                         vp->v_vflag |= VV_MAPPED;
 1195                         if (needwritemap) {
 1196                                 mutex_enter(&vp->v_interlock);
 1197                                 vp->v_iflag |= VI_WRMAP;
 1198                                 mutex_exit(&vp->v_interlock);
 1199                         }
 1200                         VOP_UNLOCK(vp, 0);
 1201                 }
 1202         }
 1203 
 1204         uvmflag = UVM_MAPFLAG(prot, maxprot,
 1205                         (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
 1206                         advice, uvmflag);
 1207         error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
 1208         if (error) {
 1209                 if (uobj)
 1210                         uobj->pgops->pgo_detach(uobj);
 1211                 return error;
 1212         }
 1213 
 1214         /*
 1215          * POSIX 1003.1b -- if our address space was configured
 1216          * to lock all future mappings, wire the one we just made.
 1217          *
 1218          * Also handle the MAP_WIRED flag here.
 1219          */
 1220 
 1221         if (prot == VM_PROT_NONE) {
 1222 
 1223                 /*
 1224                  * No more work to do in this case.
 1225                  */
 1226 
 1227                 return (0);
 1228         }
 1229         if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
 1230                 vm_map_lock(map);
 1231                 if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
 1232                     (locklimit != 0 &&
 1233                      size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
 1234                      locklimit)) {
 1235                         vm_map_unlock(map);
 1236                         uvm_unmap(map, *addr, *addr + size);
 1237                         return ENOMEM;
 1238                 }
 1239 
 1240                 /*
 1241                  * uvm_map_pageable() always returns the map unlocked.
 1242                  */
 1243 
 1244                 error = uvm_map_pageable(map, *addr, *addr + size,
 1245                                          false, UVM_LK_ENTER);
 1246                 if (error) {
 1247                         uvm_unmap(map, *addr, *addr + size);
 1248                         return error;
 1249                 }
 1250                 return (0);
 1251         }
 1252         return 0;
 1253 }
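
The alignment check in uvm_mmap() above decodes a log2 alignment from the MAP_ALIGNMENT_MASK bits of the flags. From userland on NetBSD this is normally requested with the MAP_ALIGNED() macro from <sys/mman.h>; a hypothetical sketch:

/* Hypothetical sketch: request an aligned anonymous mapping (NetBSD). */
#include <sys/mman.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
        size_t len = 4UL * 1024 * 1024;
        void *p;

        /*
         * MAP_ALIGNED(22) encodes the shift 22 in the alignment bits, so
         * the kernel tries to place the mapping on a 4 MB (1 << 22)
         * boundary and rejects impossible requests as seen above.
         */
        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
            MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(22), -1, 0);
        if (p == MAP_FAILED)
                err(1, "mmap");

        printf("mapped at %p\n", p);
        munmap(p, len);
        return 0;
}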
 1254 
 1255 vaddr_t
 1256 uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
 1257 {
 1258 
 1259         return VM_DEFAULT_ADDRESS(base, sz);
 1260 }



This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.