FreeBSD/Linux Kernel Cross Reference
sys/kern/uipc_shm.c

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
    5  * Copyright 2020 The FreeBSD Foundation
    6  * All rights reserved.
    7  *
    8  * Portions of this software were developed by BAE Systems, the University of
    9  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
   10  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
   11  * Computing (TC) research program.
   12  *
   13  * Portions of this software were developed by Konstantin Belousov
   14  * under sponsorship from the FreeBSD Foundation.
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  * 1. Redistributions of source code must retain the above copyright
   20  *    notice, this list of conditions and the following disclaimer.
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  *
   25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  */
   37 
   38 /*
   39  * Support for shared swap-backed anonymous memory objects via
   40  * shm_open(2), shm_rename(2), and shm_unlink(2).
   41  * While most of the implementation is here, vm_mmap.c contains
   42  * mapping logic changes.
   43  *
   44  * posixshmcontrol(1) allows users to inspect the state of the memory
   45  * objects.  The per-uid swap resource limit controls the total
   46  * amount of memory that a user can consume for anonymous objects,
   47  * including shared ones.
   48  */
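
/*
 * Editor's note: an illustrative userspace sketch, not part of this
 * file.  It shows the typical lifecycle of a named object using the
 * interfaces described above; error handling is mostly elided and the
 * path name "/example" is arbitrary.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
shm_lifecycle_example(void)
{
	char *p;
	int fd;

	/* Create (or open) the named object and size it to one page. */
	fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
	if (fd < 0 || ftruncate(fd, 4096) != 0)
		return (-1);

	/* Map it; other processes opening "/example" share these pages. */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	strcpy(p, "hello");

	/* Removing the name leaves existing mappings valid. */
	shm_unlink("/example");
	munmap(p, 4096);
	close(fd);
	return (0);
}
#endif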
   49 
   50 #include <sys/cdefs.h>
   51 __FBSDID("$FreeBSD$");
   52 
   53 #include "opt_capsicum.h"
   54 #include "opt_ktrace.h"
   55 
   56 #include <sys/param.h>
   57 #include <sys/capsicum.h>
   58 #include <sys/conf.h>
   59 #include <sys/fcntl.h>
   60 #include <sys/file.h>
   61 #include <sys/filedesc.h>
   62 #include <sys/filio.h>
   63 #include <sys/fnv_hash.h>
   64 #include <sys/kernel.h>
   65 #include <sys/limits.h>
   66 #include <sys/uio.h>
   67 #include <sys/signal.h>
   68 #include <sys/jail.h>
   69 #include <sys/ktrace.h>
   70 #include <sys/lock.h>
   71 #include <sys/malloc.h>
   72 #include <sys/mman.h>
   73 #include <sys/mutex.h>
   74 #include <sys/priv.h>
   75 #include <sys/proc.h>
   76 #include <sys/refcount.h>
   77 #include <sys/resourcevar.h>
   78 #include <sys/rwlock.h>
   79 #include <sys/sbuf.h>
   80 #include <sys/stat.h>
   81 #include <sys/syscallsubr.h>
   82 #include <sys/sysctl.h>
   83 #include <sys/sysproto.h>
   84 #include <sys/systm.h>
   85 #include <sys/sx.h>
   86 #include <sys/time.h>
   87 #include <sys/vmmeter.h>
   88 #include <sys/vnode.h>
   89 #include <sys/unistd.h>
   90 #include <sys/user.h>
   91 
   92 #include <security/audit/audit.h>
   93 #include <security/mac/mac_framework.h>
   94 
   95 #include <vm/vm.h>
   96 #include <vm/vm_param.h>
   97 #include <vm/pmap.h>
   98 #include <vm/vm_extern.h>
   99 #include <vm/vm_map.h>
  100 #include <vm/vm_kern.h>
  101 #include <vm/vm_object.h>
  102 #include <vm/vm_page.h>
  103 #include <vm/vm_pageout.h>
  104 #include <vm/vm_pager.h>
  105 #include <vm/swap_pager.h>
  106 
  107 struct shm_mapping {
  108         char            *sm_path;
  109         Fnv32_t         sm_fnv;
  110         struct shmfd    *sm_shmfd;
  111         LIST_ENTRY(shm_mapping) sm_link;
  112 };
  113 
  114 static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
  115 static LIST_HEAD(, shm_mapping) *shm_dictionary;
  116 static struct sx shm_dict_lock;
  117 static struct mtx shm_timestamp_lock;
  118 static u_long shm_hash;
  119 static struct unrhdr64 shm_ino_unr;
  120 static dev_t shm_dev_ino;
  121 
  122 #define SHM_HASH(fnv)   (&shm_dictionary[(fnv) & shm_hash])
  123 
  124 static void     shm_init(void *arg);
  125 static void     shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
  126 static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
  127 static int      shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
  128 static void     shm_doremove(struct shm_mapping *map);
  129 static int      shm_dotruncate_cookie(struct shmfd *shmfd, off_t length,
  130     void *rl_cookie);
  131 static int      shm_dotruncate_locked(struct shmfd *shmfd, off_t length,
  132     void *rl_cookie);
  133 static int      shm_copyin_path(struct thread *td, const char *userpath_in,
  134     char **path_out);
  135 
  136 static fo_rdwr_t        shm_read;
  137 static fo_rdwr_t        shm_write;
  138 static fo_truncate_t    shm_truncate;
  139 static fo_ioctl_t       shm_ioctl;
  140 static fo_stat_t        shm_stat;
  141 static fo_close_t       shm_close;
  142 static fo_chmod_t       shm_chmod;
  143 static fo_chown_t       shm_chown;
  144 static fo_seek_t        shm_seek;
  145 static fo_fill_kinfo_t  shm_fill_kinfo;
  146 static fo_mmap_t        shm_mmap;
  147 static fo_get_seals_t   shm_get_seals;
  148 static fo_add_seals_t   shm_add_seals;
  149 static fo_fallocate_t   shm_fallocate;
  150 
  151 /* File descriptor operations. */
  152 struct fileops shm_ops = {
  153         .fo_read = shm_read,
  154         .fo_write = shm_write,
  155         .fo_truncate = shm_truncate,
  156         .fo_ioctl = shm_ioctl,
  157         .fo_poll = invfo_poll,
  158         .fo_kqfilter = invfo_kqfilter,
  159         .fo_stat = shm_stat,
  160         .fo_close = shm_close,
  161         .fo_chmod = shm_chmod,
  162         .fo_chown = shm_chown,
  163         .fo_sendfile = vn_sendfile,
  164         .fo_seek = shm_seek,
  165         .fo_fill_kinfo = shm_fill_kinfo,
  166         .fo_mmap = shm_mmap,
  167         .fo_get_seals = shm_get_seals,
  168         .fo_add_seals = shm_add_seals,
  169         .fo_fallocate = shm_fallocate,
  170         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
  171 };
  172 
  173 FEATURE(posix_shm, "POSIX shared memory");
  174 
  175 static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  176     "");
  177 
  178 static int largepage_reclaim_tries = 1;
  179 SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries,
  180     CTLFLAG_RWTUN, &largepage_reclaim_tries, 0,
  181     "Number of contig reclaims before giving up for default alloc policy");
  182 
  183 static int
  184 uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
  185 {
  186         vm_page_t m;
  187         vm_pindex_t idx;
  188         size_t tlen;
  189         int error, offset, rv;
  190 
  191         idx = OFF_TO_IDX(uio->uio_offset);
  192         offset = uio->uio_offset & PAGE_MASK;
  193         tlen = MIN(PAGE_SIZE - offset, len);
  194 
  195         rv = vm_page_grab_valid_unlocked(&m, obj, idx,
  196             VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
  197         if (rv == VM_PAGER_OK)
  198                 goto found;
  199 
  200         /*
  201          * Read I/O without either a corresponding resident page or swap
  202          * page: use zero_region.  This is intended to avoid instantiating
  203          * pages on read from a sparse region.
  204          */
  205         VM_OBJECT_WLOCK(obj);
  206         m = vm_page_lookup(obj, idx);
  207         if (uio->uio_rw == UIO_READ && m == NULL &&
  208             !vm_pager_has_page(obj, idx, NULL, NULL)) {
  209                 VM_OBJECT_WUNLOCK(obj);
  210                 return (uiomove(__DECONST(void *, zero_region), tlen, uio));
  211         }
  212 
  213         /*
  214          * Although the tmpfs vnode lock may be held here (tmpfs
  215          * also calls uiomove_object()), it is nonetheless safe to
  216          * sleep waiting for a free page.  The pageout daemon does
  217          * not need to acquire the vnode lock to page out obj's
  218          * pages because obj is an OBJT_SWAP type object.
  219          */
  220         rv = vm_page_grab_valid(&m, obj, idx,
  221             VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
  222         if (rv != VM_PAGER_OK) {
  223                 VM_OBJECT_WUNLOCK(obj);
  224                 if (bootverbose) {
  225                         printf("uiomove_object: vm_obj %p idx %jd "
  226                             "pager error %d\n", obj, idx, rv);
  227                 }
  228                 return (rv == VM_PAGER_AGAIN ? ENOSPC : EIO);
  229         }
  230         VM_OBJECT_WUNLOCK(obj);
  231 
  232 found:
  233         error = uiomove_fromphys(&m, offset, tlen, uio);
  234         if (uio->uio_rw == UIO_WRITE && error == 0)
  235                 vm_page_set_dirty(m);
  236         vm_page_activate(m);
  237         vm_page_sunbusy(m);
  238 
  239         return (error);
  240 }
  241 
  242 int
  243 uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
  244 {
  245         ssize_t resid;
  246         size_t len;
  247         int error;
  248 
  249         error = 0;
  250         while ((resid = uio->uio_resid) > 0) {
  251                 if (obj_size <= uio->uio_offset)
  252                         break;
  253                 len = MIN(obj_size - uio->uio_offset, resid);
  254                 if (len == 0)
  255                         break;
  256                 error = uiomove_object_page(obj, len, uio);
  257                 if (error != 0 || resid == uio->uio_resid)
  258                         break;
  259         }
  260         return (error);
  261 }
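
/*
 * Editor's note: an illustrative userspace sketch, not part of this
 * file.  A read from a region with neither a resident page nor a swap
 * page is served from zero_region (see uiomove_object_page() above),
 * so no pages are instantiated; the st_blocks check relies on the
 * per-object page accounting reported by shm_stat() later in this
 * file.
 */
#if 0
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

void
sparse_read_example(void)
{
	char buf[4096];
	struct stat sb;
	int fd;

	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	ftruncate(fd, 1024 * 1024);

	/* Nothing was ever written; the read comes from zero_region. */
	pread(fd, buf, sizeof(buf), 0);

	/* The read instantiated no pages: expect sb.st_blocks == 0. */
	fstat(fd, &sb);
	close(fd);
}
#endif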
  262 
  263 static u_long count_largepages[MAXPAGESIZES];
  264 
  265 static int
  266 shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx,
  267     int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
  268 {
  269         vm_page_t m __diagused;
  270         int psind;
  271 
  272         psind = object->un_pager.phys.data_val;
  273         if (psind == 0 || pidx >= object->size)
  274                 return (VM_PAGER_FAIL);
  275         *first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE);
  276 
  277         /*
  278          * We only busy the first page in the superpage run.  It is
  279          * useless to busy the whole run since we only remove full
  280          * superpages, and it takes too long to busy e.g. 512 * 512 ==
  281          * 262144 pages constituting a 1G amd64 superpage.
  282          */
  283         m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT);
  284         MPASS(m != NULL);
  285 
  286         *last = *first + atop(pagesizes[psind]) - 1;
  287         return (VM_PAGER_OK);
  288 }
  289 
  290 static boolean_t
  291 shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
  292     int *before, int *after)
  293 {
  294         int psind;
  295 
  296         psind = object->un_pager.phys.data_val;
  297         if (psind == 0 || pindex >= object->size)
  298                 return (FALSE);
  299         if (before != NULL) {
  300                 *before = pindex - rounddown2(pindex, pagesizes[psind] /
  301                     PAGE_SIZE);
  302         }
  303         if (after != NULL) {
  304                 *after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) -
  305                     pindex;
  306         }
  307         return (TRUE);
  308 }
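
/*
 * Editor's note: a worked example of the rounding above, assuming an
 * amd64 2MB superpage (pagesizes[psind] / PAGE_SIZE == 512).  For
 * pindex 700: *before = 700 - rounddown2(700, 512) = 188, and
 * *after = roundup2(700, 512) - 700 = 1024 - 700 = 324, i.e. the
 * surrounding run covers pindexes [512, 1023].
 */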
  309 
  310 static void
  311 shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
  312     vm_ooffset_t foff, struct ucred *cred)
  313 {
  314 }
  315 
  316 static void
  317 shm_largepage_phys_dtor(vm_object_t object)
  318 {
  319         int psind;
  320 
  321         psind = object->un_pager.phys.data_val;
  322         if (psind != 0) {
  323                 atomic_subtract_long(&count_largepages[psind],
  324                     object->size / (pagesizes[psind] / PAGE_SIZE));
  325                 vm_wire_sub(object->size);
  326         } else {
  327                 KASSERT(object->size == 0,
  328                     ("largepage phys obj %p not initialized but size %#jx > 0",
  329                     object, (uintmax_t)object->size));
  330         }
  331 }
  332 
  333 static const struct phys_pager_ops shm_largepage_phys_ops = {
  334         .phys_pg_populate =     shm_largepage_phys_populate,
  335         .phys_pg_haspage =      shm_largepage_phys_haspage,
  336         .phys_pg_ctor =         shm_largepage_phys_ctor,
  337         .phys_pg_dtor =         shm_largepage_phys_dtor,
  338 };
  339 
  340 bool
  341 shm_largepage(struct shmfd *shmfd)
  342 {
  343         return (shmfd->shm_object->type == OBJT_PHYS);
  344 }
  345 
  346 static void
  347 shm_pager_freespace(vm_object_t obj, vm_pindex_t start, vm_size_t size)
  348 {
  349         struct shmfd *shm;
  350         vm_size_t c;
  351 
  352         swap_pager_freespace(obj, start, size, &c);
  353         if (c == 0)
  354                 return;
  355 
  356         shm = obj->un_pager.swp.swp_priv;
  357         if (shm == NULL)
  358                 return;
  359         KASSERT(shm->shm_pages >= c,
  360             ("shm %p pages %jd free %jd", shm,
  361             (uintmax_t)shm->shm_pages, (uintmax_t)c));
  362         shm->shm_pages -= c;
  363 }
  364 
  365 static void
  366 shm_page_inserted(vm_object_t obj, vm_page_t m)
  367 {
  368         struct shmfd *shm;
  369 
  370         shm = obj->un_pager.swp.swp_priv;
  371         if (shm == NULL)
  372                 return;
  373         if (!vm_pager_has_page(obj, m->pindex, NULL, NULL))
  374                 shm->shm_pages += 1;
  375 }
  376 
  377 static void
  378 shm_page_removed(vm_object_t obj, vm_page_t m)
  379 {
  380         struct shmfd *shm;
  381 
  382         shm = obj->un_pager.swp.swp_priv;
  383         if (shm == NULL)
  384                 return;
  385         if (!vm_pager_has_page(obj, m->pindex, NULL, NULL)) {
  386                 KASSERT(shm->shm_pages >= 1,
  387                     ("shm %p pages %jd free 1", shm,
  388                     (uintmax_t)shm->shm_pages));
  389                 shm->shm_pages -= 1;
  390         }
  391 }
  392 
  393 static struct pagerops shm_swap_pager_ops = {
  394         .pgo_kvme_type = KVME_TYPE_SWAP,
  395         .pgo_freespace = shm_pager_freespace,
  396         .pgo_page_inserted = shm_page_inserted,
  397         .pgo_page_removed = shm_page_removed,
  398 };
  399 static int shmfd_pager_type = -1;
  400 
  401 static int
  402 shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
  403 {
  404         struct shmfd *shmfd;
  405         off_t foffset;
  406         int error;
  407 
  408         shmfd = fp->f_data;
  409         foffset = foffset_lock(fp, 0);
  410         error = 0;
  411         switch (whence) {
  412         case L_INCR:
  413                 if (foffset < 0 ||
  414                     (offset > 0 && foffset > OFF_MAX - offset)) {
  415                         error = EOVERFLOW;
  416                         break;
  417                 }
  418                 offset += foffset;
  419                 break;
  420         case L_XTND:
  421                 if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
  422                         error = EOVERFLOW;
  423                         break;
  424                 }
  425                 offset += shmfd->shm_size;
  426                 break;
  427         case L_SET:
  428                 break;
  429         default:
  430                 error = EINVAL;
  431         }
  432         if (error == 0) {
  433                 if (offset < 0 || offset > shmfd->shm_size)
  434                         error = EINVAL;
  435                 else
  436                         td->td_uretoff.tdu_off = offset;
  437         }
  438         foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
  439         return (error);
  440 }
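
/*
 * Editor's note: an illustrative userspace sketch, not part of this
 * file.  Unlike a regular file, seeking a shm descriptor past its
 * current size fails with EINVAL, per the range check above.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

void
seek_example(void)
{
	int fd = shm_open(SHM_ANON, O_RDWR, 0600);

	ftruncate(fd, 4096);
	lseek(fd, 4096, SEEK_SET);	/* ok: offset == shm_size */
	lseek(fd, 8192, SEEK_SET);	/* fails with EINVAL */
}
#endif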
  441 
  442 static int
  443 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
  444     int flags, struct thread *td)
  445 {
  446         struct shmfd *shmfd;
  447         void *rl_cookie;
  448         int error;
  449 
  450         shmfd = fp->f_data;
  451 #ifdef MAC
  452         error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
  453         if (error)
  454                 return (error);
  455 #endif
  456         foffset_lock_uio(fp, uio, flags);
  457         rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
  458             uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
  459         error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
  460         rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
  461         foffset_unlock_uio(fp, uio, flags);
  462         return (error);
  463 }
  464 
  465 static int
  466 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
  467     int flags, struct thread *td)
  468 {
  469         struct shmfd *shmfd;
  470         void *rl_cookie;
  471         int error;
  472         off_t size;
  473 
  474         shmfd = fp->f_data;
  475 #ifdef MAC
  476         error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
  477         if (error)
  478                 return (error);
  479 #endif
  480         if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0)
  481                 return (EINVAL);
  482         foffset_lock_uio(fp, uio, flags);
  483         if (uio->uio_resid > OFF_MAX - uio->uio_offset) {
  484                 /*
  485                  * Overflow is only an error if we're supposed to expand on
  486                  * write.  Otherwise, we'll just truncate the write to the
  487                  * size of the file, which can only grow up to OFF_MAX.
  488                  */
  489                 if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) {
  490                         foffset_unlock_uio(fp, uio, flags);
  491                         return (EFBIG);
  492                 }
  493 
  494                 size = shmfd->shm_size;
  495         } else {
  496                 size = uio->uio_offset + uio->uio_resid;
  497         }
  498         if ((flags & FOF_OFFSET) == 0) {
  499                 rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
  500                     &shmfd->shm_mtx);
  501         } else {
  502                 rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
  503                     size, &shmfd->shm_mtx);
  504         }
  505         if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) {
  506                 error = EPERM;
  507         } else {
  508                 error = 0;
  509                 if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 &&
  510                     size > shmfd->shm_size) {
  511                         error = shm_dotruncate_cookie(shmfd, size, rl_cookie);
  512                 }
  513                 if (error == 0)
  514                         error = uiomove_object(shmfd->shm_object,
  515                             shmfd->shm_size, uio);
  516         }
  517         rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
  518         foffset_unlock_uio(fp, uio, flags);
  519         return (error);
  520 }
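
/*
 * Editor's note: an illustrative userspace sketch, not part of this
 * file.  With SHM_GROW_ON_WRITE, a write past the current size grows
 * the object via shm_dotruncate_cookie() rather than being truncated;
 * this assumes the FreeBSD 13 shm_open2(2) interface.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

void
grow_on_write_example(void)
{
	char buf[8192] = { 0 };
	int fd;

	fd = shm_open2(SHM_ANON, O_RDWR, 0600, SHM_GROW_ON_WRITE, NULL);

	/* The object starts at size 0 and grows to 8192 on this write. */
	write(fd, buf, sizeof(buf));
}
#endif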
  521 
  522 static int
  523 shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
  524     struct thread *td)
  525 {
  526         struct shmfd *shmfd;
  527 #ifdef MAC
  528         int error;
  529 #endif
  530 
  531         shmfd = fp->f_data;
  532 #ifdef MAC
  533         error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
  534         if (error)
  535                 return (error);
  536 #endif
  537         return (shm_dotruncate(shmfd, length));
  538 }
  539 
  540 int
  541 shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
  542     struct thread *td)
  543 {
  544         struct shmfd *shmfd;
  545         struct shm_largepage_conf *conf;
  546         void *rl_cookie;
  547 
  548         shmfd = fp->f_data;
  549         switch (com) {
  550         case FIONBIO:
  551         case FIOASYNC:
  552                 /*
  553                  * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
  554                  * just like it would on an unlinked regular file
  555                  */
  556                 return (0);
  557         case FIOSSHMLPGCNF:
  558                 if (!shm_largepage(shmfd))
  559                         return (ENOTTY);
  560                 conf = data;
  561                 if (shmfd->shm_lp_psind != 0 &&
  562                     conf->psind != shmfd->shm_lp_psind)
  563                         return (EINVAL);
  564                 if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES ||
  565                     pagesizes[conf->psind] == 0)
  566                         return (EINVAL);
  567                 if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT &&
  568                     conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT &&
  569                     conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD)
  570                         return (EINVAL);
  571 
  572                 rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
  573                     &shmfd->shm_mtx);
  574                 shmfd->shm_lp_psind = conf->psind;
  575                 shmfd->shm_lp_alloc_policy = conf->alloc_policy;
  576                 shmfd->shm_object->un_pager.phys.data_val = conf->psind;
  577                 rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
  578                 return (0);
  579         case FIOGSHMLPGCNF:
  580                 if (!shm_largepage(shmfd))
  581                         return (ENOTTY);
  582                 conf = data;
  583                 rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX,
  584                     &shmfd->shm_mtx);
  585                 conf->psind = shmfd->shm_lp_psind;
  586                 conf->alloc_policy = shmfd->shm_lp_alloc_policy;
  587                 rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
  588                 return (0);
  589         default:
  590                 return (ENOTTY);
  591         }
  592 }
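
/*
 * Editor's note: an illustrative userspace sketch, not part of this
 * file, of the FIOSSHMLPGCNF handshake above.  It assumes a FreeBSD 13
 * system providing shm_open2(2), SHM_LARGEPAGE, and struct
 * shm_largepage_conf, and assumes psind 1 names the 2MB page size in
 * pagesizes[].
 */
#if 0
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/filio.h>
#include <fcntl.h>
#include <unistd.h>

int
largepage_example(void)
{
	struct shm_largepage_conf conf;
	int fd;

	fd = shm_open2(SHM_ANON, O_RDWR, 0600, SHM_LARGEPAGE, NULL);
	if (fd < 0)
		return (-1);

	/* Pick the page-size index and allocation policy up front. */
	conf.psind = 1;
	conf.alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
	if (ioctl(fd, FIOSSHMLPGCNF, &conf) != 0)
		return (-1);

	/* Sizes must now be multiples of pagesizes[1] (2MB here). */
	return (ftruncate(fd, 2 * 1024 * 1024));
}
#endif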
  593 
  594 static int
  595 shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
  596     struct thread *td)
  597 {
  598         struct shmfd *shmfd;
  599 #ifdef MAC
  600         int error;
  601 #endif
  602 
  603         shmfd = fp->f_data;
  604 
  605 #ifdef MAC
  606         error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
  607         if (error)
  608                 return (error);
  609 #endif
  610 
  611         /*
  612          * Attempt to return sane-ish values for fstat() on a memory file
  613          * descriptor.
  614          */
  615         bzero(sb, sizeof(*sb));
  616         sb->st_blksize = PAGE_SIZE;
  617         sb->st_size = shmfd->shm_size;
  618         mtx_lock(&shm_timestamp_lock);
  619         sb->st_atim = shmfd->shm_atime;
  620         sb->st_ctim = shmfd->shm_ctime;
  621         sb->st_mtim = shmfd->shm_mtime;
  622         sb->st_birthtim = shmfd->shm_birthtime;
  623         sb->st_mode = S_IFREG | shmfd->shm_mode;                /* XXX */
  624         sb->st_uid = shmfd->shm_uid;
  625         sb->st_gid = shmfd->shm_gid;
  626         mtx_unlock(&shm_timestamp_lock);
  627         sb->st_dev = shm_dev_ino;
  628         sb->st_ino = shmfd->shm_ino;
  629         sb->st_nlink = shmfd->shm_object->ref_count;
  630         if (shm_largepage(shmfd)) {
  631                 sb->st_blocks = shmfd->shm_object->size /
  632                     (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT);
  633         } else {
  634                 sb->st_blocks = shmfd->shm_pages;
  635         }
  636 
  637         return (0);
  638 }
  639 
  640 static int
  641 shm_close(struct file *fp, struct thread *td)
  642 {
  643         struct shmfd *shmfd;
  644 
  645         shmfd = fp->f_data;
  646         fp->f_data = NULL;
  647         shm_drop(shmfd);
  648 
  649         return (0);
  650 }
  651 
  652 static int
  653 shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) {
  654         int error;
  655         char *path;
  656         const char *pr_path;
  657         size_t pr_pathlen;
  658 
  659         path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
  660         pr_path = td->td_ucred->cr_prison->pr_path;
  661 
  662         /* Construct a full pathname for jailed callers. */
  663         pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 :
  664             strlcpy(path, pr_path, MAXPATHLEN);
  665         error = copyinstr(userpath_in, path + pr_pathlen,
  666             MAXPATHLEN - pr_pathlen, NULL);
  667         if (error != 0)
  668                 goto out;
  669 
  670 #ifdef KTRACE
  671         if (KTRPOINT(curthread, KTR_NAMEI))
  672                 ktrnamei(path);
  673 #endif
  674 
  675         /* Require paths to start with a '/' character. */
  676         if (path[pr_pathlen] != '/') {
  677                 error = EINVAL;
  678                 goto out;
  679         }
  680 
  681         *path_out = path;
  682 
  683 out:
  684         if (error != 0)
  685                 free(path, M_SHMFD);
  686 
  687         return (error);
  688 }
  689 
  690 static int
  691 shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
  692 {
  693         vm_object_t object;
  694         vm_page_t m;
  695         vm_pindex_t idx, nobjsize;
  696         vm_ooffset_t delta;
  697         int base, rv;
  698 
  699         KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
  700         object = shmfd->shm_object;
  701         VM_OBJECT_ASSERT_WLOCKED(object);
  702         rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
  703         if (length == shmfd->shm_size)
  704                 return (0);
  705         nobjsize = OFF_TO_IDX(length + PAGE_MASK);
  706 
  707         /* Are we shrinking?  If so, trim the end. */
  708         if (length < shmfd->shm_size) {
  709                 if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
  710                         return (EPERM);
  711 
  712                 /*
  713                  * Disallow any requests to shrink the size if this
  714                  * object is mapped into the kernel.
  715                  */
  716                 if (shmfd->shm_kmappings > 0)
  717                         return (EBUSY);
  718 
  719                 /*
  720                  * Zero the truncated part of the last page.
  721                  */
  722                 base = length & PAGE_MASK;
  723                 if (base != 0) {
  724                         idx = OFF_TO_IDX(length);
  725 retry:
  726                         m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
  727                         if (m != NULL) {
  728                                 MPASS(vm_page_all_valid(m));
  729                         } else if (vm_pager_has_page(object, idx, NULL, NULL)) {
  730                                 m = vm_page_alloc(object, idx,
  731                                     VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
  732                                 if (m == NULL)
  733                                         goto retry;
  734                                 vm_object_pip_add(object, 1);
  735                                 VM_OBJECT_WUNLOCK(object);
  736                                 rv = vm_pager_get_pages(object, &m, 1, NULL,
  737                                     NULL);
  738                                 VM_OBJECT_WLOCK(object);
  739                                 vm_object_pip_wakeup(object);
  740                                 if (rv == VM_PAGER_OK) {
  741                                         /*
  742                                          * Since the page was not resident,
  743                                          * and therefore not recently
  744                                          * accessed, immediately enqueue it
  745                                          * for asynchronous laundering.  The
  746                                          * current operation is not regarded
  747                                          * as an access.
  748                                          */
  749                                         vm_page_launder(m);
  750                                 } else {
  751                                         vm_page_free(m);
  752                                         VM_OBJECT_WUNLOCK(object);
  753                                         return (EIO);
  754                                 }
  755                         }
  756                         if (m != NULL) {
  757                                 pmap_zero_page_area(m, base, PAGE_SIZE - base);
  758                                 KASSERT(vm_page_all_valid(m),
  759                                     ("shm_dotruncate: page %p is invalid", m));
  760                                 vm_page_set_dirty(m);
  761                                 vm_page_xunbusy(m);
  762                         }
  763                 }
  764                 delta = IDX_TO_OFF(object->size - nobjsize);
  765 
  766                 if (nobjsize < object->size)
  767                         vm_object_page_remove(object, nobjsize, object->size,
  768                             0);
  769 
  770                 /* Free the swap accounted for shm */
  771                 swap_release_by_cred(delta, object->cred);
  772                 object->charge -= delta;
  773         } else {
  774                 if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
  775                         return (EPERM);
  776 
  777                 /* Try to reserve additional swap space. */
  778                 delta = IDX_TO_OFF(nobjsize - object->size);
  779                 if (!swap_reserve_by_cred(delta, object->cred))
  780                         return (ENOMEM);
  781                 object->charge += delta;
  782         }
  783         shmfd->shm_size = length;
  784         mtx_lock(&shm_timestamp_lock);
  785         vfs_timestamp(&shmfd->shm_ctime);
  786         shmfd->shm_mtime = shmfd->shm_ctime;
  787         mtx_unlock(&shm_timestamp_lock);
  788         object->size = nobjsize;
  789         return (0);
  790 }
  791 
  792 static int
  793 shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
  794 {
  795         vm_object_t object;
  796         vm_page_t m;
  797         vm_pindex_t newobjsz;
  798         vm_pindex_t oldobjsz __unused;
  799         int aflags, error, i, psind, try;
  800 
  801         KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
  802         object = shmfd->shm_object;
  803         VM_OBJECT_ASSERT_WLOCKED(object);
  804         rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
  805 
  806         oldobjsz = object->size;
  807         newobjsz = OFF_TO_IDX(length);
  808         if (length == shmfd->shm_size)
  809                 return (0);
  810         psind = shmfd->shm_lp_psind;
  811         if (psind == 0 && length != 0)
  812                 return (EINVAL);
  813         if ((length & (pagesizes[psind] - 1)) != 0)
  814                 return (EINVAL);
  815 
  816         if (length < shmfd->shm_size) {
  817                 if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
  818                         return (EPERM);
  819                 if (shmfd->shm_kmappings > 0)
  820                         return (EBUSY);
  821                 return (ENOTSUP);       /* Pages are unmanaged. */
  822 #if 0
  823                 vm_object_page_remove(object, newobjsz, oldobjsz, 0);
  824                 object->size = newobjsz;
  825                 shmfd->shm_size = length;
  826                 return (0);
  827 #endif
  828         }
  829 
  830         if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
  831                 return (EPERM);
  832 
  833         aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
  834         if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT)
  835                 aflags |= VM_ALLOC_WAITFAIL;
  836         try = 0;
  837 
  838         /*
  839          * Extend the shmfd and object, keeping all already fully
  840          * allocated large pages intact even on error, because the
  841          * dropped object lock might have allowed them to be mapped.
  842          */
  843         while (object->size < newobjsz) {
  844                 m = vm_page_alloc_contig(object, object->size, aflags,
  845                     pagesizes[psind] / PAGE_SIZE, 0, ~0,
  846                     pagesizes[psind], 0,
  847                     VM_MEMATTR_DEFAULT);
  848                 if (m == NULL) {
  849                         VM_OBJECT_WUNLOCK(object);
  850                         if (shmfd->shm_lp_alloc_policy ==
  851                             SHM_LARGEPAGE_ALLOC_NOWAIT ||
  852                             (shmfd->shm_lp_alloc_policy ==
  853                             SHM_LARGEPAGE_ALLOC_DEFAULT &&
  854                             try >= largepage_reclaim_tries)) {
  855                                 VM_OBJECT_WLOCK(object);
  856                                 return (ENOMEM);
  857                         }
  858                         error = vm_page_reclaim_contig(aflags,
  859                             pagesizes[psind] / PAGE_SIZE, 0, ~0,
  860                             pagesizes[psind], 0) ? 0 :
  861                             vm_wait_intr(object);
  862                         if (error != 0) {
  863                                 VM_OBJECT_WLOCK(object);
  864                                 return (error);
  865                         }
  866                         try++;
  867                         VM_OBJECT_WLOCK(object);
  868                         continue;
  869                 }
  870                 try = 0;
  871                 for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) {
  872                         if ((m[i].flags & PG_ZERO) == 0)
  873                                 pmap_zero_page(&m[i]);
  874                         vm_page_valid(&m[i]);
  875                         vm_page_xunbusy(&m[i]);
  876                 }
  877                 object->size += OFF_TO_IDX(pagesizes[psind]);
  878                 shmfd->shm_size += pagesizes[psind];
  879                 atomic_add_long(&count_largepages[psind], 1);
  880                 vm_wire_add(atop(pagesizes[psind]));
  881         }
  882         return (0);
  883 }
  884 
  885 static int
  886 shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie)
  887 {
  888         int error;
  889 
  890         VM_OBJECT_WLOCK(shmfd->shm_object);
  891         error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd,
  892             length, rl_cookie) : shm_dotruncate_locked(shmfd, length,
  893             rl_cookie);
  894         VM_OBJECT_WUNLOCK(shmfd->shm_object);
  895         return (error);
  896 }
  897 
  898 int
  899 shm_dotruncate(struct shmfd *shmfd, off_t length)
  900 {
  901         void *rl_cookie;
  902         int error;
  903 
  904         rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
  905             &shmfd->shm_mtx);
  906         error = shm_dotruncate_cookie(shmfd, length, rl_cookie);
  907         rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
  908         return (error);
  909 }
  910 
  911 /*
  912  * shmfd object management including creation and reference counting
  913  * routines.
  914  */
  915 struct shmfd *
  916 shm_alloc(struct ucred *ucred, mode_t mode, bool largepage)
  917 {
  918         struct shmfd *shmfd;
  919         vm_object_t obj;
  920 
  921         shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
  922         shmfd->shm_size = 0;
  923         shmfd->shm_uid = ucred->cr_uid;
  924         shmfd->shm_gid = ucred->cr_gid;
  925         shmfd->shm_mode = mode;
  926         if (largepage) {
  927                 shmfd->shm_object = phys_pager_allocate(NULL,
  928                     &shm_largepage_phys_ops, NULL, shmfd->shm_size,
  929                     VM_PROT_DEFAULT, 0, ucred);
  930                 shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
  931         } else {
  932                 obj = vm_pager_allocate(shmfd_pager_type, NULL,
  933                     shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
  934                 VM_OBJECT_WLOCK(obj);
  935                 obj->un_pager.swp.swp_priv = shmfd;
  936                 VM_OBJECT_WUNLOCK(obj);
  937                 shmfd->shm_object = obj;
  938         }
  939         KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
  940         vfs_timestamp(&shmfd->shm_birthtime);
  941         shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
  942             shmfd->shm_birthtime;
  943         shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
  944         refcount_init(&shmfd->shm_refs, 1);
  945         mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
  946         rangelock_init(&shmfd->shm_rl);
  947 #ifdef MAC
  948         mac_posixshm_init(shmfd);
  949         mac_posixshm_create(ucred, shmfd);
  950 #endif
  951 
  952         return (shmfd);
  953 }
  954 
  955 struct shmfd *
  956 shm_hold(struct shmfd *shmfd)
  957 {
  958 
  959         refcount_acquire(&shmfd->shm_refs);
  960         return (shmfd);
  961 }
  962 
  963 void
  964 shm_drop(struct shmfd *shmfd)
  965 {
  966         vm_object_t obj;
  967 
  968         if (refcount_release(&shmfd->shm_refs)) {
  969 #ifdef MAC
  970                 mac_posixshm_destroy(shmfd);
  971 #endif
  972                 rangelock_destroy(&shmfd->shm_rl);
  973                 mtx_destroy(&shmfd->shm_mtx);
  974                 obj = shmfd->shm_object;
  975                 if (!shm_largepage(shmfd)) {
  976                         VM_OBJECT_WLOCK(obj);
  977                         obj->un_pager.swp.swp_priv = NULL;
  978                         VM_OBJECT_WUNLOCK(obj);
  979                 }
  980                 vm_object_deallocate(obj);
  981                 free(shmfd, M_SHMFD);
  982         }
  983 }
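
/*
 * Editor's note: the reference protocol in brief.  shm_alloc() returns
 * an object with one reference owned by the caller; every shm_hold()
 * must be balanced by a shm_drop(), and the final drop tears the
 * object down.  For example, shm_insert() below takes a hold on behalf
 * of the dictionary that shm_doremove() later drops.
 */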
  984 
  985 /*
  986  * Determine if the credentials have sufficient permissions for a
  987  * specified combination of FREAD and FWRITE.
  988  */
  989 int
  990 shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
  991 {
  992         accmode_t accmode;
  993         int error;
  994 
  995         accmode = 0;
  996         if (flags & FREAD)
  997                 accmode |= VREAD;
  998         if (flags & FWRITE)
  999                 accmode |= VWRITE;
 1000         mtx_lock(&shm_timestamp_lock);
 1001         error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 1002             accmode, ucred);
 1003         mtx_unlock(&shm_timestamp_lock);
 1004         return (error);
 1005 }
 1006 
 1007 static void
 1008 shm_init(void *arg)
 1009 {
 1010         char name[32];
 1011         int i;
 1012 
 1013         mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
 1014         sx_init(&shm_dict_lock, "shm dictionary");
 1015         shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
 1016         new_unrhdr64(&shm_ino_unr, 1);
 1017         shm_dev_ino = devfs_alloc_cdp_inode();
 1018         KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
 1019         shmfd_pager_type = vm_pager_alloc_dyn_type(&shm_swap_pager_ops,
 1020             OBJT_SWAP);
 1021         MPASS(shmfd_pager_type != -1);
 1022 
 1023         for (i = 1; i < MAXPAGESIZES; i++) {
 1024                 if (pagesizes[i] == 0)
 1025                         break;
 1026 #define M       (1024 * 1024)
 1027 #define G       (1024 * M)
 1028                 if (pagesizes[i] >= G)
 1029                         snprintf(name, sizeof(name), "%luG", pagesizes[i] / G);
 1030                 else if (pagesizes[i] >= M)
 1031                         snprintf(name, sizeof(name), "%luM", pagesizes[i] / M);
 1032                 else
 1033                         snprintf(name, sizeof(name), "%lu", pagesizes[i]);
 1034 #undef G
 1035 #undef M
 1036                 SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages),
 1037                     OID_AUTO, name, CTLFLAG_RD, &count_largepages[i],
 1038                     "number of non-transient largepages allocated");
 1039         }
 1040 }
 1041 SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
 1042 
 1043 /*
 1044  * Remove all shared memory objects that belong to a prison.
 1045  */
 1046 void
 1047 shm_remove_prison(struct prison *pr)
 1048 {
 1049         struct shm_mapping *shmm, *tshmm;
 1050         u_long i;
 1051 
 1052         sx_xlock(&shm_dict_lock);
 1053         for (i = 0; i < shm_hash + 1; i++) {
 1054                 LIST_FOREACH_SAFE(shmm, &shm_dictionary[i], sm_link, tshmm) {
 1055                         if (shmm->sm_shmfd->shm_object->cred &&
 1056                             shmm->sm_shmfd->shm_object->cred->cr_prison == pr)
 1057                                 shm_doremove(shmm);
 1058                 }
 1059         }
 1060         sx_xunlock(&shm_dict_lock);
 1061 }
 1062 
 1063 /*
 1064  * Dictionary management.  We maintain an in-kernel dictionary to map
 1065  * paths to shmfd objects.  We use the FNV hash on the path to store
 1066  * the mappings in a hash table.
 1067  */
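
/*
 * Editor's note: the bucketing scheme in brief.  shm_hash is the mask
 * returned by hashinit() (1023 for the 1024-bucket table created in
 * shm_init()), so a lookup reduces to computing
 * fnv = fnv_32_str(path, FNV1_32_INIT) and walking the list at
 * SHM_HASH(fnv), i.e. &shm_dictionary[fnv & shm_hash], comparing both
 * the stored hash and the full path to resolve collisions.
 */
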
 1068 static struct shmfd *
 1069 shm_lookup(char *path, Fnv32_t fnv)
 1070 {
 1071         struct shm_mapping *map;
 1072 
 1073         LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
 1074                 if (map->sm_fnv != fnv)
 1075                         continue;
 1076                 if (strcmp(map->sm_path, path) == 0)
 1077                         return (map->sm_shmfd);
 1078         }
 1079 
 1080         return (NULL);
 1081 }
 1082 
 1083 static void
 1084 shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
 1085 {
 1086         struct shm_mapping *map;
 1087 
 1088         map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
 1089         map->sm_path = path;
 1090         map->sm_fnv = fnv;
 1091         map->sm_shmfd = shm_hold(shmfd);
 1092         shmfd->shm_path = path;
 1093         LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
 1094 }
 1095 
 1096 static int
 1097 shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 1098 {
 1099         struct shm_mapping *map;
 1100         int error;
 1101 
 1102         LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
 1103                 if (map->sm_fnv != fnv)
 1104                         continue;
 1105                 if (strcmp(map->sm_path, path) == 0) {
 1106 #ifdef MAC
 1107                         error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
 1108                         if (error)
 1109                                 return (error);
 1110 #endif
 1111                         error = shm_access(map->sm_shmfd, ucred,
 1112                             FREAD | FWRITE);
 1113                         if (error)
 1114                                 return (error);
 1115                         shm_doremove(map);
 1116                         return (0);
 1117                 }
 1118         }
 1119 
 1120         return (ENOENT);
 1121 }
 1122 
 1123 static void
 1124 shm_doremove(struct shm_mapping *map)
 1125 {
 1126         map->sm_shmfd->shm_path = NULL;
 1127         LIST_REMOVE(map, sm_link);
 1128         shm_drop(map->sm_shmfd);
 1129         free(map->sm_path, M_SHMFD);
 1130         free(map, M_SHMFD);
 1131 }
 1132 
 1133 int
 1134 kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
 1135     int shmflags, struct filecaps *fcaps, const char *name __unused)
 1136 {
 1137         struct pwddesc *pdp;
 1138         struct shmfd *shmfd;
 1139         struct file *fp;
 1140         char *path;
 1141         void *rl_cookie;
 1142         Fnv32_t fnv;
 1143         mode_t cmode;
 1144         int error, fd, initial_seals;
 1145         bool largepage;
 1146 
 1147         if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE |
 1148             SHM_LARGEPAGE)) != 0)
 1149                 return (EINVAL);
 1150 
 1151         initial_seals = F_SEAL_SEAL;
 1152         if ((shmflags & SHM_ALLOW_SEALING) != 0)
 1153                 initial_seals &= ~F_SEAL_SEAL;
 1154 
 1155 #ifdef CAPABILITY_MODE
 1156         /*
 1157          * shm_open(2) is only allowed for anonymous objects.
 1158          */
 1159         if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
 1160                 return (ECAPMODE);
 1161 #endif
 1162 
 1163         AUDIT_ARG_FFLAGS(flags);
 1164         AUDIT_ARG_MODE(mode);
 1165 
 1166         if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
 1167                 return (EINVAL);
 1168 
 1169         if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
 1170                 return (EINVAL);
 1171 
 1172         largepage = (shmflags & SHM_LARGEPAGE) != 0;
 1173         if (largepage && !PMAP_HAS_LARGEPAGES)
 1174                 return (ENOTTY);
 1175 
 1176         /*
 1177          * Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
 1178          * If the decision is made later to allow additional seals, care must be
 1179          * taken below to ensure that the seals are properly set if the shmfd
 1180          * already existed -- this currently assumes that only F_SEAL_SEAL can
 1181          * be set and doesn't take further precautions to ensure the validity of
 1182          * the seals being added with respect to current mappings.
 1183          */
 1184         if ((initial_seals & ~F_SEAL_SEAL) != 0)
 1185                 return (EINVAL);
 1186 
 1187         pdp = td->td_proc->p_pd;
 1188         cmode = (mode & ~pdp->pd_cmask) & ACCESSPERMS;
 1189 
 1190         /*
 1191          * shm_open(2) created shm should always have O_CLOEXEC set, as mandated
 1192          * by POSIX.  We allow it to be unset here so that an in-kernel
 1193          * interface may be written as a thin layer around shm, optionally not
 1194          * setting CLOEXEC.  For shm_open(2), O_CLOEXEC is set unconditionally
 1195          * in sys_shm_open() to keep this implementation compliant.
 1196          */
 1197         error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps);
 1198         if (error)
 1199                 return (error);
 1200 
 1201         /* A SHM_ANON path pointer creates an anonymous object. */
 1202         if (userpath == SHM_ANON) {
 1203                 /* A read-only anonymous object is pointless. */
 1204                 if ((flags & O_ACCMODE) == O_RDONLY) {
 1205                         fdclose(td, fp, fd);
 1206                         fdrop(fp, td);
 1207                         return (EINVAL);
 1208                 }
 1209                 shmfd = shm_alloc(td->td_ucred, cmode, largepage);
 1210                 shmfd->shm_seals = initial_seals;
 1211                 shmfd->shm_flags = shmflags;
 1212         } else {
 1213                 error = shm_copyin_path(td, userpath, &path);
 1214                 if (error != 0) {
 1215                         fdclose(td, fp, fd);
 1216                         fdrop(fp, td);
 1217                         return (error);
 1218                 }
 1219 
 1220                 AUDIT_ARG_UPATH1_CANON(path);
 1221                 fnv = fnv_32_str(path, FNV1_32_INIT);
 1222                 sx_xlock(&shm_dict_lock);
 1223                 shmfd = shm_lookup(path, fnv);
 1224                 if (shmfd == NULL) {
 1225                         /* Object does not yet exist, create it if requested. */
 1226                         if (flags & O_CREAT) {
 1227 #ifdef MAC
 1228                                 error = mac_posixshm_check_create(td->td_ucred,
 1229                                     path);
 1230                                 if (error == 0) {
 1231 #endif
 1232                                         shmfd = shm_alloc(td->td_ucred, cmode,
 1233                                             largepage);
 1234                                         shmfd->shm_seals = initial_seals;
 1235                                         shmfd->shm_flags = shmflags;
 1236                                         shm_insert(path, fnv, shmfd);
 1237 #ifdef MAC
 1238                                 }
 1239 #endif
 1240                         } else {
 1241                                 free(path, M_SHMFD);
 1242                                 error = ENOENT;
 1243                         }
 1244                 } else {
 1245                         rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 1246                             &shmfd->shm_mtx);
 1247 
 1248                         /*
 1249                          * kern_shm_open() likely shouldn't ever error out on
 1250                          * trying to set a seal that already exists, unlike
 1251                          * F_ADD_SEALS.  This would break terribly as
 1252                          * shm_open(2) actually sets F_SEAL_SEAL to maintain
 1253                          * historical behavior where the underlying file could
 1254                          * not be sealed.
 1255                          */
 1256                         initial_seals &= ~shmfd->shm_seals;
 1257 
 1258                         /*
 1259                          * Object already exists, obtain a new
 1260                          * reference if requested and permitted.
 1261                          */
 1262                         free(path, M_SHMFD);
 1263 
 1264                          * initial_seals can't set additional seals if
 1265                          * F_SEAL_SEAL has already been set.  If F_SEAL_SEAL is set,
 1266                          * already been set F_SEAL_SEAL.  If F_SEAL_SEAL is set,
 1267                          * then we've already removed that one from
 1268                          * initial_seals.  This is currently redundant as we
 1269                          * only allow setting F_SEAL_SEAL at creation time, but
 1270                          * it's cheap to check and decreases the effort required
 1271                          * to allow additional seals.
 1272                          */
 1273                         if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 &&
 1274                             initial_seals != 0)
 1275                                 error = EPERM;
 1276                         else if ((flags & (O_CREAT | O_EXCL)) ==
 1277                             (O_CREAT | O_EXCL))
 1278                                 error = EEXIST;
 1279                         else if (shmflags != 0 && shmflags != shmfd->shm_flags)
 1280                                 error = EINVAL;
 1281                         else {
 1282 #ifdef MAC
 1283                                 error = mac_posixshm_check_open(td->td_ucred,
 1284                                     shmfd, FFLAGS(flags & O_ACCMODE));
 1285                                 if (error == 0)
 1286 #endif
 1287                                 error = shm_access(shmfd, td->td_ucred,
 1288                                     FFLAGS(flags & O_ACCMODE));
 1289                         }
 1290 
 1291                         /*
 1292                          * Truncate the file back to zero length if
 1293                          * O_TRUNC was specified and the object was
 1294                          * opened with read/write.
 1295                          */
 1296                         if (error == 0 &&
 1297                             (flags & (O_ACCMODE | O_TRUNC)) ==
 1298                             (O_RDWR | O_TRUNC)) {
 1299                                 VM_OBJECT_WLOCK(shmfd->shm_object);
 1300 #ifdef MAC
 1301                                 error = mac_posixshm_check_truncate(
 1302                                         td->td_ucred, fp->f_cred, shmfd);
 1303                                 if (error == 0)
 1304 #endif
 1305                                         error = shm_dotruncate_locked(shmfd, 0,
 1306                                             rl_cookie);
 1307                                 VM_OBJECT_WUNLOCK(shmfd->shm_object);
 1308                         }
 1309                         if (error == 0) {
 1310                                 /*
 1311                                  * Currently we only allow F_SEAL_SEAL to be
 1312                                  * set initially.  As noted above, this would
 1313                                  * need to be reworked should that change.
 1314                                  */
 1315                                 shmfd->shm_seals |= initial_seals;
 1316                                 shm_hold(shmfd);
 1317                         }
 1318                         rangelock_unlock(&shmfd->shm_rl, rl_cookie,
 1319                             &shmfd->shm_mtx);
 1320                 }
 1321                 sx_xunlock(&shm_dict_lock);
 1322 
 1323                 if (error) {
 1324                         fdclose(td, fp, fd);
 1325                         fdrop(fp, td);
 1326                         return (error);
 1327                 }
 1328         }
 1329 
 1330         finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
 1331 
 1332         td->td_retval[0] = fd;
 1333         fdrop(fp, td);
 1334 
 1335         return (0);
 1336 }
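
/*
 * Editor's note: an illustrative userspace sketch, not part of this
 * file.  SHM_ALLOW_SEALING clears F_SEAL_SEAL at creation time, after
 * which fcntl(2) may add seals; memfd_create(2), present on FreeBSD
 * 13, is used here as a convenient way to request MFD_ALLOW_SEALING.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int
seal_example(void)
{
	int fd;

	fd = memfd_create("example", MFD_ALLOW_SEALING);
	if (fd < 0 || ftruncate(fd, 4096) != 0)
		return (-1);

	/* Freeze the size: later grow/shrink attempts fail. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) != 0)
		return (-1);

	/* This now fails with EPERM because F_SEAL_GROW is set. */
	return (ftruncate(fd, 8192));
}
#endif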
 1337 
 1338 /* System calls. */
 1339 #ifdef COMPAT_FREEBSD12
 1340 int
 1341 freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap)
 1342 {
 1343 
 1344         return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC,
 1345             uap->mode, NULL));
 1346 }
 1347 #endif
 1348 
 1349 int
 1350 sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
 1351 {
 1352         char *path;
 1353         Fnv32_t fnv;
 1354         int error;
 1355 
 1356         error = shm_copyin_path(td, uap->path, &path);
 1357         if (error != 0)
 1358                 return (error);
 1359 
 1360         AUDIT_ARG_UPATH1_CANON(path);
 1361         fnv = fnv_32_str(path, FNV1_32_INIT);
 1362         sx_xlock(&shm_dict_lock);
 1363         error = shm_remove(path, fnv, td->td_ucred);
 1364         sx_xunlock(&shm_dict_lock);
 1365         free(path, M_SHMFD);
 1366 
 1367         return (error);
 1368 }
 1369 
 1370 int
 1371 sys_shm_rename(struct thread *td, struct shm_rename_args *uap)
 1372 {
 1373         char *path_from = NULL, *path_to = NULL;
 1374         Fnv32_t fnv_from, fnv_to;
 1375         struct shmfd *fd_from;
 1376         struct shmfd *fd_to;
 1377         int error;
 1378         int flags;
 1379 
 1380         flags = uap->flags;
 1381         AUDIT_ARG_FFLAGS(flags);
 1382 
 1383         /*
 1384          * Make sure the user passed only valid flags.
 1385          * If you add a new flag, please add a new term here.
 1386          */
 1387         if ((flags & ~(
 1388             SHM_RENAME_NOREPLACE |
 1389             SHM_RENAME_EXCHANGE
 1390             )) != 0) {
 1391                 error = EINVAL;
 1392                 goto out;
 1393         }
 1394 
 1395         /*
 1396          * EXCHANGE and NOREPLACE are mutually exclusive; force the user
 1397          * to choose one or the other.
 1398          */
 1399         if ((flags & SHM_RENAME_NOREPLACE) != 0 &&
 1400             (flags & SHM_RENAME_EXCHANGE) != 0) {
 1401                 error = EINVAL;
 1402                 goto out;
 1403         }
 1404 
 1405         /* Renaming to or from anonymous makes no sense */
 1406         if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) {
 1407                 error = EINVAL;
 1408                 goto out;
 1409         }
 1410 
 1411         error = shm_copyin_path(td, uap->path_from, &path_from);
 1412         if (error != 0)
 1413                 goto out;
 1414 
 1415         error = shm_copyin_path(td, uap->path_to, &path_to);
 1416         if (error != 0)
 1417                 goto out;
 1418 
 1419         AUDIT_ARG_UPATH1_CANON(path_from);
 1420         AUDIT_ARG_UPATH2_CANON(path_to);
 1421 
 1422         /* Rename with from/to equal is a no-op */
 1423         if (strcmp(path_from, path_to) == 0)
 1424                 goto out;
 1425 
 1426         fnv_from = fnv_32_str(path_from, FNV1_32_INIT);
 1427         fnv_to = fnv_32_str(path_to, FNV1_32_INIT);
 1428 
 1429         sx_xlock(&shm_dict_lock);
 1430 
 1431         fd_from = shm_lookup(path_from, fnv_from);
 1432         if (fd_from == NULL) {
 1433                 error = ENOENT;
 1434                 goto out_locked;
 1435         }
 1436 
 1437         fd_to = shm_lookup(path_to, fnv_to);
 1438         if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) {
 1439                 error = EEXIST;
 1440                 goto out_locked;
 1441         }
 1442 
 1443         /*
 1444          * Take a hold on fd_from unconditionally so that shm_remove
 1445          * cannot invalidate the 'from' shm's state.
 1446          */
 1447         shm_hold(fd_from);
 1448         error = shm_remove(path_from, fnv_from, td->td_ucred);
 1449 
 1450         /*
 1451          * If we get ENOENT here, one of our assumptions failed (e.g.,
 1452          * the lock did not protect us).
 1453          */
 1454         KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s",
 1455             path_from));
 1456         if (error != 0) {
 1457                 shm_drop(fd_from);
 1458                 goto out_locked;
 1459         }
 1460 
 1461         /*
 1462          * If we are exchanging, we need to ensure the shm_remove below
 1463          * doesn't invalidate the dest shm's state.
 1464          */
 1465         if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL)
 1466                 shm_hold(fd_to);
 1467 
 1468         /*
 1469          * NOTE: if path_to is not already in the hash, that is fine; it
 1470          * simply means there is nothing at path_to to unlink.  That is
 1471          * the ENOENT case.
 1472          *
 1473          * If we lack permission to unlink the shm at path_to but had it
 1474          * for the shm at path_from, relink the shm to path_from and
 1475          * abort with EACCES.
 1476          *
 1477          * On any other error, relink and abort the operation as well.
 1478          */
 1480         error = shm_remove(path_to, fnv_to, td->td_ucred);
 1481         if (error != 0 && error != ENOENT) {
 1482                 shm_insert(path_from, fnv_from, fd_from);
 1483                 shm_drop(fd_from);
 1484                 /* Don't free path_from now, since the hash references it */
 1485                 path_from = NULL;
 1486                 goto out_locked;
 1487         }
 1488 
 1489         error = 0;
 1490 
 1491         shm_insert(path_to, fnv_to, fd_from);
 1492 
 1493         /* Don't free path_to now, since the hash references it */
 1494         path_to = NULL;
 1495 
 1496         /* We kept a ref when we removed, and incremented again in insert */
 1497         shm_drop(fd_from);
 1498         KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n",
 1499             fd_from->shm_refs));
 1500 
 1501         if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) {
 1502                 shm_insert(path_from, fnv_from, fd_to);
 1503                 path_from = NULL;
 1504                 shm_drop(fd_to);
 1505                 KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n",
 1506                     fd_to->shm_refs));
 1507         }
 1508 
 1509 out_locked:
 1510         sx_xunlock(&shm_dict_lock);
 1511 
 1512 out:
 1513         free(path_from, M_SHMFD);
 1514         free(path_to, M_SHMFD);
 1515         return (error);
 1516 }
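
/*
 * Editor's illustration (hedged userspace sketch) of the flag semantics
 * implemented above; each call is shown independently against two
 * existing objects:
 *
 *	int a = shm_open("/a", O_RDWR | O_CREAT, 0600);
 *	int b = shm_open("/b", O_RDWR | O_CREAT, 0600);
 *
 *	shm_rename("/a", "/b", 0);			(replaces "/b")
 *	shm_rename("/a", "/b", SHM_RENAME_NOREPLACE);	(EEXIST: "/b" exists)
 *	shm_rename("/a", "/b", SHM_RENAME_EXCHANGE);	(atomically swaps)
 *	shm_rename("/a", "/b",
 *	    SHM_RENAME_NOREPLACE | SHM_RENAME_EXCHANGE);  (always EINVAL)
 */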
 1517 
 1518 static int
 1519 shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr,
 1520     vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags,
 1521     vm_ooffset_t foff, struct thread *td)
 1522 {
 1523         struct vmspace *vms;
 1524         vm_map_entry_t next_entry, prev_entry;
 1525         vm_offset_t align, mask, maxaddr;
 1526         int docow, error, rv, try;
 1527         bool curmap;
 1528 
 1529         if (shmfd->shm_lp_psind == 0)
 1530                 return (EINVAL);
 1531 
 1532         /* MAP_PRIVATE is disabled */
 1533         if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL |
 1534             MAP_NOCORE |
 1535 #ifdef MAP_32BIT
 1536             MAP_32BIT |
 1537 #endif
 1538             MAP_ALIGNMENT_MASK)) != 0)
 1539                 return (EINVAL);
 1540 
 1541         vms = td->td_proc->p_vmspace;
 1542         curmap = map == &vms->vm_map;
 1543         if (curmap) {
 1544                 error = kern_mmap_racct_check(td, map, size);
 1545                 if (error != 0)
 1546                         return (error);
 1547         }
 1548 
 1549         docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT;
 1550         docow |= MAP_INHERIT_SHARE;
 1551         if ((flags & MAP_NOCORE) != 0)
 1552                 docow |= MAP_DISABLE_COREDUMP;
 1553 
 1554         mask = pagesizes[shmfd->shm_lp_psind] - 1;
 1555         if ((foff & mask) != 0)
 1556                 return (EINVAL);
 1557         maxaddr = vm_map_max(map);
 1558 #ifdef MAP_32BIT
 1559         if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR)
 1560                 maxaddr = MAP_32BIT_MAX_ADDR;
 1561 #endif
 1562         if (size == 0 || (size & mask) != 0 ||
 1563             (*addr != 0 && ((*addr & mask) != 0 ||
 1564             *addr + size < *addr || *addr + size > maxaddr)))
 1565                 return (EINVAL);
 1566 
 1567         align = flags & MAP_ALIGNMENT_MASK;
 1568         if (align == 0) {
 1569                 align = pagesizes[shmfd->shm_lp_psind];
 1570         } else if (align == MAP_ALIGNED_SUPER) {
 1571                 if (shmfd->shm_lp_psind != 1)
 1572                         return (EINVAL);
 1573                 align = pagesizes[1];
 1574         } else {
 1575                 align >>= MAP_ALIGNMENT_SHIFT;
 1576                 align = 1ULL << align;
 1577                 /* Also handles overflow. */
 1578                 if (align < pagesizes[shmfd->shm_lp_psind])
 1579                         return (EINVAL);
 1580         }
 1581 
 1582         vm_map_lock(map);
 1583         if ((flags & MAP_FIXED) == 0) {
 1584                 try = 1;
 1585                 if (curmap && (*addr == 0 ||
 1586                     (*addr >= round_page((vm_offset_t)vms->vm_taddr) &&
 1587                     *addr < round_page((vm_offset_t)vms->vm_daddr +
 1588                     lim_max(td, RLIMIT_DATA))))) {
 1589                         *addr = roundup2((vm_offset_t)vms->vm_daddr +
 1590                             lim_max(td, RLIMIT_DATA),
 1591                             pagesizes[shmfd->shm_lp_psind]);
 1592                 }
 1593 again:
 1594                 rv = vm_map_find_aligned(map, addr, size, maxaddr, align);
 1595                 if (rv != KERN_SUCCESS) {
 1596                         if (try == 1) {
 1597                                 try = 2;
 1598                                 *addr = vm_map_min(map);
 1599                                 if ((*addr & mask) != 0)
 1600                                         *addr = (*addr + mask) & ~mask;
 1601                                 goto again;
 1602                         }
 1603                         goto fail1;
 1604                 }
 1605         } else if ((flags & MAP_EXCL) == 0) {
 1606                 rv = vm_map_delete(map, *addr, *addr + size);
 1607                 if (rv != KERN_SUCCESS)
 1608                         goto fail1;
 1609         } else {
 1610                 error = ENOSPC;
 1611                 if (vm_map_lookup_entry(map, *addr, &prev_entry))
 1612                         goto fail;
 1613                 next_entry = vm_map_entry_succ(prev_entry);
 1614                 if (next_entry->start < *addr + size)
 1615                         goto fail;
 1616         }
 1617 
 1618         rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size,
 1619             prot, max_prot, docow);
 1620 fail1:
 1621         error = vm_mmap_to_errno(rv);
 1622 fail:
 1623         vm_map_unlock(map);
 1624         return (error);
 1625 }
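
/*
 * Editor's note: a hedged standalone restatement of the MAP_ALIGNED(n)
 * decoding above (assumes the MAP_ALIGNMENT_* definitions from
 * <sys/mman.h> and omits the MAP_ALIGNED_SUPER special case); a request
 * weaker than the large page size backing the object is rejected:
 *
 *	static vm_offset_t
 *	lp_align(int flags, vm_offset_t lpsize)
 *	{
 *		vm_offset_t align;
 *
 *		align = flags & MAP_ALIGNMENT_MASK;
 *		if (align == 0)
 *			return (lpsize);
 *		align = (vm_offset_t)1 << (align >> MAP_ALIGNMENT_SHIFT);
 *		return (align >= lpsize ? align : 0);
 *	}
 *
 * (A zero return stands in for the EINVAL cases.)
 */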
 1626 
 1627 static int
 1628 shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
 1629     vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
 1630     vm_ooffset_t foff, struct thread *td)
 1631 {
 1632         struct shmfd *shmfd;
 1633         vm_prot_t maxprot;
 1634         int error;
 1635         bool writecnt;
 1636         void *rl_cookie;
 1637 
 1638         shmfd = fp->f_data;
 1639         maxprot = VM_PROT_NONE;
 1640 
 1641         rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize,
 1642             &shmfd->shm_mtx);
 1643         /* FREAD should always be set. */
 1644         if ((fp->f_flag & FREAD) != 0)
 1645                 maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
 1646 
 1647         /*
 1648          * If FWRITE is set, we can allow VM_PROT_WRITE unless it is a
 1649          * shared mapping with a write seal applied.  Private mappings
 1650          * are always writable.
 1651          */
 1652         if ((flags & MAP_SHARED) == 0) {
 1653                 cap_maxprot |= VM_PROT_WRITE;
 1654                 maxprot |= VM_PROT_WRITE;
 1655                 writecnt = false;
 1656         } else {
 1657                 if ((fp->f_flag & FWRITE) != 0 &&
 1658                     (shmfd->shm_seals & F_SEAL_WRITE) == 0)
 1659                         maxprot |= VM_PROT_WRITE;
 1660 
 1661                 /*
 1662                  * Any mappings from a writable descriptor may be upgraded to
 1663                  * VM_PROT_WRITE with mprotect(2), unless a write-seal was
 1664                  * applied between the open and subsequent mmap(2).  We want to
 1665                  * reject application of a write seal as long as any such
 1666                  * mapping exists so that the seal cannot be trivially bypassed.
 1667                  */
 1668                 writecnt = (maxprot & VM_PROT_WRITE) != 0;
 1669                 if (!writecnt && (prot & VM_PROT_WRITE) != 0) {
 1670                         error = EACCES;
 1671                         goto out;
 1672                 }
 1673         }
 1674         maxprot &= cap_maxprot;
 1675 
 1676         /* See comment in vn_mmap(). */
 1677         if (
 1678 #ifdef _LP64
 1679             objsize > OFF_MAX ||
 1680 #endif
 1681             foff > OFF_MAX - objsize) {
 1682                 error = EINVAL;
 1683                 goto out;
 1684         }
 1685 
 1686 #ifdef MAC
 1687         error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
 1688         if (error != 0)
 1689                 goto out;
 1690 #endif
 1691 
 1692         mtx_lock(&shm_timestamp_lock);
 1693         vfs_timestamp(&shmfd->shm_atime);
 1694         mtx_unlock(&shm_timestamp_lock);
 1695         vm_object_reference(shmfd->shm_object);
 1696 
 1697         if (shm_largepage(shmfd)) {
 1698                 writecnt = false;
 1699                 error = shm_mmap_large(shmfd, map, addr, objsize, prot,
 1700                     maxprot, flags, foff, td);
 1701         } else {
 1702                 if (writecnt) {
 1703                         vm_pager_update_writecount(shmfd->shm_object, 0,
 1704                             objsize);
 1705                 }
 1706                 error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
 1707                     shmfd->shm_object, foff, writecnt, td);
 1708         }
 1709         if (error != 0) {
 1710                 if (writecnt)
 1711                         vm_pager_release_writecount(shmfd->shm_object, 0,
 1712                             objsize);
 1713                 vm_object_deallocate(shmfd->shm_object);
 1714         }
 1715 out:
 1716         rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 1717         return (error);
 1718 }
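
/*
 * Editor's illustration (hedged userspace sketch): the write-seal logic
 * above refuses PROT_WRITE on MAP_SHARED mappings of a sealed object,
 * while MAP_PRIVATE copies remain writable:
 *
 *	int fd = memfd_create("demo", MFD_ALLOW_SEALING);
 *	ftruncate(fd, 4096);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);
 *	void *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	(shared is expected to be MAP_FAILED with errno == EACCES.)
 *	void *priv = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *	    MAP_PRIVATE, fd, 0);
 *	(the private mapping is expected to succeed.)
 */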
 1719 
 1720 static int
 1721 shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
 1722     struct thread *td)
 1723 {
 1724         struct shmfd *shmfd;
 1725         int error;
 1726 
 1727         error = 0;
 1728         shmfd = fp->f_data;
 1729         mtx_lock(&shm_timestamp_lock);
 1730         /*
 1731          * SUSv4 says that x bits of permission need not be affected.
 1732          * Be consistent with our shm_open there.
 1733          */
 1734 #ifdef MAC
 1735         error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
 1736         if (error != 0)
 1737                 goto out;
 1738 #endif
 1739         error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 1740             VADMIN, active_cred);
 1741         if (error != 0)
 1742                 goto out;
 1743         shmfd->shm_mode = mode & ACCESSPERMS;
 1744 out:
 1745         mtx_unlock(&shm_timestamp_lock);
 1746         return (error);
 1747 }
 1748 
 1749 static int
 1750 shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 1751     struct thread *td)
 1752 {
 1753         struct shmfd *shmfd;
 1754         int error;
 1755 
 1756         error = 0;
 1757         shmfd = fp->f_data;
 1758         mtx_lock(&shm_timestamp_lock);
 1759 #ifdef MAC
 1760         error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
 1761         if (error != 0)
 1762                 goto out;
 1763 #endif
 1764         if (uid == (uid_t)-1)
 1765                 uid = shmfd->shm_uid;
 1766         if (gid == (gid_t)-1)
 1767                 gid = shmfd->shm_gid;
 1768         if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
 1769             (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
 1770             (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
 1771                 goto out;
 1772         shmfd->shm_uid = uid;
 1773         shmfd->shm_gid = gid;
 1774 out:
 1775         mtx_unlock(&shm_timestamp_lock);
 1776         return (error);
 1777 }
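
/*
 * Editor's illustration (hedged userspace sketch): the two handlers above
 * back fchmod(2) and fchown(2) on shm descriptors:
 *
 *	int fd = shm_open("/demo", O_RDWR | O_CREAT, 0600);
 *	fchmod(fd, 0640);		(stored mode is masked with ACCESSPERMS)
 *	fchown(fd, (uid_t)-1, gid);	((uid_t)-1 leaves the owner unchanged)
 */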
 1778 
 1779 /*
 1780  * Helper routines to allow the backing object of a shared memory file
 1781  * descriptor to be mapped in the kernel.
 1782  */
 1783 int
 1784 shm_map(struct file *fp, size_t size, off_t offset, void **memp)
 1785 {
 1786         struct shmfd *shmfd;
 1787         vm_offset_t kva, ofs;
 1788         vm_object_t obj;
 1789         int rv;
 1790 
 1791         if (fp->f_type != DTYPE_SHM)
 1792                 return (EINVAL);
 1793         shmfd = fp->f_data;
 1794         obj = shmfd->shm_object;
 1795         VM_OBJECT_WLOCK(obj);
 1796         /*
 1797          * XXXRW: This validation is probably insufficient, and subject to
 1798          * sign errors.  It should be fixed.
 1799          */
 1800         if (offset >= shmfd->shm_size ||
 1801             offset + size > round_page(shmfd->shm_size)) {
 1802                 VM_OBJECT_WUNLOCK(obj);
 1803                 return (EINVAL);
 1804         }
 1805 
 1806         shmfd->shm_kmappings++;
 1807         vm_object_reference_locked(obj);
 1808         VM_OBJECT_WUNLOCK(obj);
 1809 
 1810         /* Map the object into the kernel_map and wire it. */
 1811         kva = vm_map_min(kernel_map);
 1812         ofs = offset & PAGE_MASK;
 1813         offset = trunc_page(offset);
 1814         size = round_page(size + ofs);
 1815         rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
 1816             VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 1817             VM_PROT_READ | VM_PROT_WRITE, 0);
 1818         if (rv == KERN_SUCCESS) {
 1819                 rv = vm_map_wire(kernel_map, kva, kva + size,
 1820                     VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 1821                 if (rv == KERN_SUCCESS) {
 1822                         *memp = (void *)(kva + ofs);
 1823                         return (0);
 1824                 }
 1825                 vm_map_remove(kernel_map, kva, kva + size);
 1826         } else
 1827                 vm_object_deallocate(obj);
 1828 
 1829         /* On failure, drop our mapping reference. */
 1830         VM_OBJECT_WLOCK(obj);
 1831         shmfd->shm_kmappings--;
 1832         VM_OBJECT_WUNLOCK(obj);
 1833 
 1834         return (vm_mmap_to_errno(rv));
 1835 }
 1836 
 1837 /*
 1838  * We require the caller to unmap the entire entry.  This allows us to
 1839  * safely decrement shm_kmappings when a mapping is removed.
 1840  */
 1841 int
 1842 shm_unmap(struct file *fp, void *mem, size_t size)
 1843 {
 1844         struct shmfd *shmfd;
 1845         vm_map_entry_t entry;
 1846         vm_offset_t kva, ofs;
 1847         vm_object_t obj;
 1848         vm_pindex_t pindex;
 1849         vm_prot_t prot;
 1850         boolean_t wired;
 1851         vm_map_t map;
 1852         int rv;
 1853 
 1854         if (fp->f_type != DTYPE_SHM)
 1855                 return (EINVAL);
 1856         shmfd = fp->f_data;
 1857         kva = (vm_offset_t)mem;
 1858         ofs = kva & PAGE_MASK;
 1859         kva = trunc_page(kva);
 1860         size = round_page(size + ofs);
 1861         map = kernel_map;
 1862         rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
 1863             &obj, &pindex, &prot, &wired);
 1864         if (rv != KERN_SUCCESS)
 1865                 return (EINVAL);
 1866         if (entry->start != kva || entry->end != kva + size) {
 1867                 vm_map_lookup_done(map, entry);
 1868                 return (EINVAL);
 1869         }
 1870         vm_map_lookup_done(map, entry);
 1871         if (obj != shmfd->shm_object)
 1872                 return (EINVAL);
 1873         vm_map_remove(map, kva, kva + size);
 1874         VM_OBJECT_WLOCK(obj);
 1875         KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
 1876         shmfd->shm_kmappings--;
 1877         VM_OBJECT_WUNLOCK(obj);
 1878         return (0);
 1879 }
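
/*
 * Editor's sketch of an in-kernel consumer of the two helpers above
 * (hypothetical, error handling abbreviated): a window of the object is
 * wired into kernel_map, accessed directly, and the exact same range is
 * then unmapped, keeping shm_kmappings balanced:
 *
 *	void *mem;
 *	int error;
 *
 *	error = shm_map(fp, len, off, &mem);
 *	if (error == 0) {
 *		memset(mem, 0, len);
 *		error = shm_unmap(fp, mem, len);
 *	}
 */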
 1880 
 1881 static int
 1882 shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list)
 1883 {
 1884         const char *path, *pr_path;
 1885         size_t pr_pathlen;
 1886         bool visible;
 1887 
 1888         sx_assert(&shm_dict_lock, SA_LOCKED);
 1889         kif->kf_type = KF_TYPE_SHM;
 1890         kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;
 1891         kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
 1892         if (shmfd->shm_path != NULL) {
 1893                 path = shmfd->shm_path;
 1894                 pr_path = curthread->td_ucred->cr_prison->pr_path;
 1895                 if (strcmp(pr_path, "/") != 0) {
 1896                         /* Return the jail-rooted pathname. */
 1897                         pr_pathlen = strlen(pr_path);
 1898                         visible = strncmp(path, pr_path, pr_pathlen) == 0 &&
 1899                             path[pr_pathlen] == '/';
 1900                         if (list && !visible)
 1901                                 return (EPERM);
 1902                         if (visible)
 1903                                 path += pr_pathlen;
 1904                 }
 1905                 strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
 1906         }
 1909         return (0);
 1910 }
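
/*
 * Editor's note: the jail visibility test above reduces to a prefix
 * check; a hedged standalone restatement:
 *
 *	static bool
 *	jail_visible(const char *path, const char *pr_path)
 *	{
 *		size_t n = strlen(pr_path);
 *
 *		return (strncmp(path, pr_path, n) == 0 && path[n] == '/');
 *	}
 */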
 1911 
 1912 static int
 1913 shm_fill_kinfo(struct file *fp, struct kinfo_file *kif,
 1914     struct filedesc *fdp __unused)
 1915 {
 1916         int res;
 1917 
 1918         sx_slock(&shm_dict_lock);
 1919         res = shm_fill_kinfo_locked(fp->f_data, kif, false);
 1920         sx_sunlock(&shm_dict_lock);
 1921         return (res);
 1922 }
 1923 
 1924 static int
 1925 shm_add_seals(struct file *fp, int seals)
 1926 {
 1927         struct shmfd *shmfd;
 1928         void *rl_cookie;
 1929         vm_ooffset_t writemappings;
 1930         int error, nseals;
 1931 
 1932         error = 0;
 1933         shmfd = fp->f_data;
 1934         rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 1935             &shmfd->shm_mtx);
 1936 
 1937         /* Even already-set seals should result in EPERM. */
 1938         if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) {
 1939                 error = EPERM;
 1940                 goto out;
 1941         }
 1942         nseals = seals & ~shmfd->shm_seals;
 1943         if ((nseals & F_SEAL_WRITE) != 0) {
 1944                 if (shm_largepage(shmfd)) {
 1945                         error = ENOTSUP;
 1946                         goto out;
 1947                 }
 1948 
 1949                 /*
 1950                  * The rangelock above prevents writable mappings from being
 1951                  * added after we've started applying seals.  The RLOCK here
 1952                  * is to avoid torn reads on ILP32 arches as unmapping/reducing
 1953                  * writemappings will be done without a rangelock.
 1954                  */
 1955                 VM_OBJECT_RLOCK(shmfd->shm_object);
 1956                 writemappings = shmfd->shm_object->un_pager.swp.writemappings;
 1957                 VM_OBJECT_RUNLOCK(shmfd->shm_object);
 1958                 /* kmappings are also writable */
 1959                 if (writemappings > 0) {
 1960                         error = EBUSY;
 1961                         goto out;
 1962                 }
 1963         }
 1964         shmfd->shm_seals |= nseals;
 1965 out:
 1966         rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 1967         return (error);
 1968 }
 1969 
 1970 static int
 1971 shm_get_seals(struct file *fp, int *seals)
 1972 {
 1973         struct shmfd *shmfd;
 1974 
 1975         shmfd = fp->f_data;
 1976         *seals = shmfd->shm_seals;
 1977         return (0);
 1978 }
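
/*
 * Editor's illustration (hedged userspace sketch): these handlers back
 * fcntl(2)'s F_ADD_SEALS and F_GET_SEALS on shm descriptors:
 *
 *	int fd = memfd_create("demo", MFD_ALLOW_SEALING);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK);
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_SEAL);	(lock the seal set)
 *	fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE);	(now fails with EPERM)
 *	int seals = fcntl(fd, F_GET_SEALS);	(returns the seal mask)
 */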
 1979 
 1980 static int
 1981 shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 1982 {
 1983         void *rl_cookie;
 1984         struct shmfd *shmfd;
 1985         size_t size;
 1986         int error;
 1987 
 1988         /* This assumes that the caller already checked for overflow. */
 1989         error = 0;
 1990         shmfd = fp->f_data;
 1991         size = offset + len;
 1992 
 1993         /*
 1994          * Just grab the rangelock for the range that we may be attempting to
 1995          * grow, rather than blocking read/write for regions we won't be
 1996          * touching while this (potential) resize is in progress.  Other
 1997          * attempts to resize the shmfd will have to take a write lock from 0 to
 1998          * OFF_MAX, so this being potentially beyond the current usable range of
 1999          * the shmfd is not necessarily a concern.  If other mechanisms are
 2000          * added to grow a shmfd, this may need to be re-evaluated.
 2001          */
 2002         rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size,
 2003             &shmfd->shm_mtx);
 2004         if (size > shmfd->shm_size)
 2005                 error = shm_dotruncate_cookie(shmfd, size, rl_cookie);
 2006         rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 2007         /* Translate to posix_fallocate(2) return value as needed. */
 2008         if (error == ENOMEM)
 2009                 error = ENOSPC;
 2010         return (error);
 2011 }
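
/*
 * Editor's illustration (hedged userspace sketch): this handler backs
 * posix_fallocate(2) on shm descriptors, with ENOMEM mapped to ENOSPC as
 * noted above:
 *
 *	int fd = shm_open(SHM_ANON, O_RDWR, 0600);
 *	int error = posix_fallocate(fd, 0, 1 << 20);
 *	(error is 0 on success, or ENOSPC if the swap reservation failed.)
 */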
 2012 
 2013 static int
 2014 sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS)
 2015 {
 2016         struct shm_mapping *shmm;
 2017         struct sbuf sb;
 2018         struct kinfo_file kif;
 2019         u_long i;
 2020         ssize_t curlen;
 2021         int error, error2;
 2022 
 2023         sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req);
 2024         sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 2025         curlen = 0;
 2026         error = 0;
 2027         sx_slock(&shm_dict_lock);
 2028         for (i = 0; i < shm_hash + 1; i++) {
 2029                 LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) {
 2030                         error = shm_fill_kinfo_locked(shmm->sm_shmfd,
 2031                             &kif, true);
 2032                         if (error == EPERM) {
 2033                                 error = 0;
 2034                                 continue;
 2035                         }
 2036                         if (error != 0)
 2037                                 break;
 2038                         pack_kinfo(&kif);
 2039                         error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ?
 2040                             0 : ENOMEM;
 2041                         if (error != 0)
 2042                                 break;
 2043                         curlen += kif.kf_structsize;
 2044                 }
 2045         }
 2046         sx_sunlock(&shm_dict_lock);
 2047         error2 = sbuf_finish(&sb);
 2048         sbuf_delete(&sb);
 2049         return (error != 0 ? error : error2);
 2050 }
 2051 
 2052 SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list,
 2053     CTLFLAG_RD | CTLFLAG_PRISON | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE,
 2054     NULL, 0, sysctl_posix_shm_list, "",
 2055     "POSIX SHM list");
 2056 
 2057 int
 2058 kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode,
 2059     struct filecaps *caps)
 2060 {
 2061 
 2062         return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL));
 2063 }
 2064 
 2065 /*
 2066  * This version of the shm_open() interface leaves CLOEXEC behavior up to the
 2067  * caller, and libc will enforce it for the traditional shm_open() call.  This
 2068  * allows other consumers, like memfd_create(), to opt-in for CLOEXEC.  This
 2069  * interface also includes a 'name' argument that is currently unused, but could
 2070  * potentially be exported later via some interface for debugging purposes.
 2071  * From the kernel's perspective, it is optional.  Individual consumers like
 2072  * memfd_create() may require it in order to be compatible with other systems
 2073  * implementing the same function.
 2074  */
 2075 int
 2076 sys_shm_open2(struct thread *td, struct shm_open2_args *uap)
 2077 {
 2078 
 2079         return (kern_shm_open2(td, uap->path, uap->flags, uap->mode,
 2080             uap->shmflags, NULL, uap->name));
 2081 }
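
/*
 * Editor's note (hedged): libc builds both shm_open(3) and
 * memfd_create(3) on top of this entry point; an anonymous, sealable
 * memfd-style object corresponds roughly to:
 *
 *	shm_open2(SHM_ANON, O_RDWR | O_CLOEXEC, 0, SHM_ALLOW_SEALING,
 *	    "memfd:demo");
 *
 * (The "memfd:" name prefix is an assumption about the libc wrapper, not
 * something this file enforces.)
 */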
