The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_descrip.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: kern_descrip.c,v 1.251 2021/06/29 22:40:53 dholland Exp $      */
    2 
    3 /*-
    4  * Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Andrew Doran.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   29  * POSSIBILITY OF SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  * Copyright (c) 1982, 1986, 1989, 1991, 1993
   34  *      The Regents of the University of California.  All rights reserved.
   35  * (c) UNIX System Laboratories, Inc.
   36  * All or some portions of this file are derived from material licensed
   37  * to the University of California by American Telephone and Telegraph
   38  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   39  * the permission of UNIX System Laboratories, Inc.
   40  *
   41  * Redistribution and use in source and binary forms, with or without
   42  * modification, are permitted provided that the following conditions
   43  * are met:
   44  * 1. Redistributions of source code must retain the above copyright
   45  *    notice, this list of conditions and the following disclaimer.
   46  * 2. Redistributions in binary form must reproduce the above copyright
   47  *    notice, this list of conditions and the following disclaimer in the
   48  *    documentation and/or other materials provided with the distribution.
   49  * 3. Neither the name of the University nor the names of its contributors
   50  *    may be used to endorse or promote products derived from this software
   51  *    without specific prior written permission.
   52  *
   53  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   63  * SUCH DAMAGE.
   64  *
   65  *      @(#)kern_descrip.c      8.8 (Berkeley) 2/14/95
   66  */
   67 
   68 /*
   69  * File descriptor management.
   70  */
   71 
   72 #include <sys/cdefs.h>
   73 __KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.251 2021/06/29 22:40:53 dholland Exp $");
   74 
   75 #include <sys/param.h>
   76 #include <sys/systm.h>
   77 #include <sys/filedesc.h>
   78 #include <sys/kernel.h>
   79 #include <sys/proc.h>
   80 #include <sys/file.h>
   81 #include <sys/socket.h>
   82 #include <sys/socketvar.h>
   83 #include <sys/stat.h>
   84 #include <sys/ioctl.h>
   85 #include <sys/fcntl.h>
   86 #include <sys/pool.h>
   87 #include <sys/unistd.h>
   88 #include <sys/resourcevar.h>
   89 #include <sys/conf.h>
   90 #include <sys/event.h>
   91 #include <sys/kauth.h>
   92 #include <sys/atomic.h>
   93 #include <sys/syscallargs.h>
   94 #include <sys/cpu.h>
   95 #include <sys/kmem.h>
   96 #include <sys/vnode.h>
   97 #include <sys/sysctl.h>
   98 #include <sys/ktrace.h>
   99 
  100 /*
  101  * A list (head) of open files, counter, and lock protecting them.
       * Each of the three objects is cache-line aligned.
  102  */
  103 struct filelist         filehead        __cacheline_aligned;
  104 static u_int            nfiles          __cacheline_aligned;
  105 kmutex_t                filelist_lock   __cacheline_aligned;
  106 
      /*
       * Pool caches for filedesc_t, file_t and fdfile_t objects.  All three
       * are created in fd_sys_init().
       */
  107 static pool_cache_t     filedesc_cache  __read_mostly;
  108 static pool_cache_t     file_cache      __read_mostly;
  109 static pool_cache_t     fdfile_cache    __read_mostly;
  110 
      /* Constructor/destructor hooks passed to pool_cache_init(). */
  111 static int      file_ctor(void *, void *, int);
  112 static void     file_dtor(void *, void *);
  113 static int      fdfile_ctor(void *, void *, int);
  114 static void     fdfile_dtor(void *, void *);
  115 static int      filedesc_ctor(void *, void *, int);
  116 static void     filedesc_dtor(void *, void *);
      /* Open entry point installed in filedesc_cdevsw. */
  117 static int      filedescopen(dev_t, int, int, lwp_t *);
  118 
      /*
       * Handlers for the "file" and "file2" sysctl nodes registered in
       * fd_sys_init().
       * NOTE(review): fill_file()/fill_file2() look like helpers for these
       * handlers -- confirm against their definitions.
       */
  119 static int sysctl_kern_file(SYSCTLFN_PROTO);
  120 static int sysctl_kern_file2(SYSCTLFN_PROTO);
  121 static void fill_file(struct file *, const struct file *);
  122 static void fill_file2(struct kinfo_file *, const file_t *, const fdfile_t *,
  123                       int, pid_t);
  124 
      /*
       * Character device switch for the file descriptor driver.  Only the
       * open entry point is implemented here (filedescopen); every other
       * entry is wired to the generic no*() handler of the same kind.
       * The device is flagged D_OTHER and D_MPSAFE.
       */
  125 const struct cdevsw filedesc_cdevsw = {
  126         .d_open = filedescopen,
  127         .d_close = noclose,
  128         .d_read = noread,
  129         .d_write = nowrite,
  130         .d_ioctl = noioctl,
  131         .d_stop = nostop,
  132         .d_tty = notty,
  133         .d_poll = nopoll,
  134         .d_mmap = nommap,
  135         .d_kqfilter = nokqfilter,
  136         .d_discard = nodiscard,
  137         .d_flag = D_OTHER | D_MPSAFE
  138 };
  139 
  140 /*
       * For ease of reading: fd_putvnode() and fd_putsock() are aliases of
       * fd_putfile(), so a reference obtained through fd_getvnode() or
       * fd_getsock() can be released under a matching name.
       */
  141 __strong_alias(fd_putvnode,fd_putfile)
  142 __strong_alias(fd_putsock,fd_putfile)
  143 
  144 /*
  145  * Initialize the descriptor system.
       *
       * Sets up filelist_lock and the global open-file list, creates the
       * pool caches for file_t, fdfile_t and filedesc_t objects, and
       * registers the "file" and "file2" sysctl nodes under CTL_KERN.
       * Cache creation is only checked with KASSERT; the return values of
       * sysctl_createv() are not examined.
  146  */
  147 void
  148 fd_sys_init(void)
  149 {
  150         static struct sysctllog *clog;
  151 
  152         mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
  153 
  154         LIST_INIT(&filehead);
  155 
              /* All three caches request coherency_unit alignment. */
  156         file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
  157             0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
  158         KASSERT(file_cache != NULL);
  159 
              /* Only the fdfile cache is created with PR_LARGECACHE. */
  160         fdfile_cache = pool_cache_init(sizeof(fdfile_t), coherency_unit, 0,
  161             PR_LARGECACHE, "fdfile", NULL, IPL_NONE, fdfile_ctor, fdfile_dtor,
  162             NULL);
  163         KASSERT(fdfile_cache != NULL);
  164 
  165         filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
  166             0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
  167             NULL);
  168         KASSERT(filedesc_cache != NULL);
  169 
              /* Both nodes share the same description string. */
  170         sysctl_createv(&clog, 0, NULL, NULL,
  171                        CTLFLAG_PERMANENT,
  172                        CTLTYPE_STRUCT, "file",
  173                        SYSCTL_DESCR("System open file table"),
  174                        sysctl_kern_file, 0, NULL, 0,
  175                        CTL_KERN, KERN_FILE, CTL_EOL);
  176         sysctl_createv(&clog, 0, NULL, NULL,
  177                        CTLFLAG_PERMANENT,
  178                        CTLTYPE_STRUCT, "file2",
  179                        SYSCTL_DESCR("System open file table"),
  180                        sysctl_kern_file2, 0, NULL, 0,
  181                        CTL_KERN, KERN_FILE2, CTL_EOL);
  182 }
  183 
  184 static bool
  185 fd_isused(filedesc_t *fdp, unsigned fd)
  186 {
  187         u_int off = fd >> NDENTRYSHIFT;
  188 
  189         KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
  190 
  191         return (fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0;
  192 }
  193 
  194 /*
  195  * Verify that the bitmaps match the descriptor table.
       *
       * Compiled to an empty function unless DEBUG is defined.  For every
       * slot in the current table it asserts that:
       *  - slots below NDFDFILE point at the built-in fd_dfdfile[] entry;
       *  - a slot with no fdfile is not marked used in the bitmap;
       *  - a slot with an open file is marked used in the bitmap.
       * A slot that has an fdfile but no file installed is not checked.
       * The caller must hold fd_lock unless fd_refcnt <= 1 (asserted).
  196  */
  197 static inline void
  198 fd_checkmaps(filedesc_t *fdp)
  199 {
  200 #ifdef DEBUG
  201         fdtab_t *dt;
  202         u_int fd;
  203 
  204         KASSERT(fdp->fd_refcnt <= 1 || mutex_owned(&fdp->fd_lock));
  205 
  206         dt = fdp->fd_dt;
  207         if (fdp->fd_refcnt == -1) {
  208                 /*
  209                  * fd_free tears down the table without maintaining its bitmap.
  210                  */
  211                 return;
  212         }
  213         for (fd = 0; fd < dt->dt_nfiles; fd++) {
  214                 if (fd < NDFDFILE) {
  215                         KASSERT(dt->dt_ff[fd] ==
  216                             (fdfile_t *)fdp->fd_dfdfile[fd]);
  217                 }
  218                 if (dt->dt_ff[fd] == NULL) {
  219                         KASSERT(!fd_isused(fdp, fd));
  220                 } else if (dt->dt_ff[fd]->ff_file != NULL) {
  221                         KASSERT(fd_isused(fdp, fd));
  222                 }
  223         }
  224 #endif
  225 }
  226 
      /*
       * Find the lowest clear bit at position >= want in `bitmap'.
       *
       * fdp:    descriptor table; its fd_lock must be held (asserted).
       * bitmap: array of uint32_t words, indexed by bit position
       *         >> NDENTRYSHIFT.
       * want:   first bit position to consider.
       * bits:   size of the map in bits; words below NDLOSLOTS(bits) are
       *         scanned.
       *
       * Returns the position of the bit found, or -1 if want > bits or
       * every word examined is completely set.
       */
  227 static int
  228 fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
  229 {
  230         int i, off, maxoff;
  231         uint32_t sub;
  232 
  233         KASSERT(mutex_owned(&fdp->fd_lock));
  234 
  235         fd_checkmaps(fdp);
  236 
  237         if (want > bits)
  238                 return -1;
  239 
  240         off = want >> NDENTRYSHIFT;
  241         i = want & NDENTRYMASK;
  242         if (i) {
                      /*
                       * want is not word aligned: treat the i bits below it
                       * in the first word as set so they cannot be chosen.
                       */
  243                 sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
  244                 if (sub != ~0)
  245                         goto found;
  246                 off++;
  247         }
  248 
              /* Scan the remaining whole words for one that is not full. */
  249         maxoff = NDLOSLOTS(bits);
  250         while (off < maxoff) {
  251                 if ((sub = bitmap[off]) != ~0)
  252                         goto found;
  253                 off++;
  254         }
  255 
  256         return -1;
  257 
  258  found:
              /* ffs() is 1-based, hence the -1; ~sub has a bit per free slot. */
  259         return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
  260 }
  261 
      /*
       * Find the highest allocated descriptor below `last'.
       *
       * Walks the low-level bitmap backwards from the word containing
       * last - 1, skipping words that are entirely zero.  Returns -1 if no
       * non-zero word is found.  Otherwise it scans the fdfile array
       * downwards from the top of that word (capped at last - 1) for an
       * entry with ff_allocated set and returns its index.
       *
       * The downward scan stops at index 0 without testing that slot, so 0
       * is returned whenever no higher allocated slot is found.
       *
       * The caller must hold fd_lock (asserted).
       */
  262 static int
  263 fd_last_set(filedesc_t *fd, int last)
  264 {
  265         int off, i;
  266         fdfile_t **ff = fd->fd_dt->dt_ff;
  267         uint32_t *bitmap = fd->fd_lomap;
  268 
  269         KASSERT(mutex_owned(&fd->fd_lock));
  270 
  271         fd_checkmaps(fd);
  272 
  273         off = (last - 1) >> NDENTRYSHIFT;
  274 
  275         while (off >= 0 && !bitmap[off])
  276                 off--;
  277 
  278         if (off < 0)
  279                 return -1;
  280 
              /* Highest index covered by word `off', capped below `last'. */
  281         i = ((off + 1) << NDENTRYSHIFT) - 1;
  282         if (i >= last)
  283                 i = last - 1;
  284 
  285         /* XXX should use bitmap */
  286         while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
  287                 i--;
  288 
  289         return i;
  290 }
  291 
      /*
       * Mark descriptor slot `fd' as allocated.
       *
       * Sets ff_allocated on the slot's fdfile and the matching bit in
       * fd_lomap.  If that fills the lomap word, the word's bit in fd_himap
       * is set as well.  fd_lastfile is raised if fd lies above it.
       *
       * Preconditions (asserted): fd_lock is held, the slot has an fdfile,
       * no file is installed in it, and it is not already allocated.
       */
  292 static inline void
  293 fd_used(filedesc_t *fdp, unsigned fd)
  294 {
  295         u_int off = fd >> NDENTRYSHIFT;
  296         fdfile_t *ff;
  297 
  298         ff = fdp->fd_dt->dt_ff[fd];
  299 
  300         KASSERT(mutex_owned(&fdp->fd_lock));
  301         KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) == 0);
  302         KASSERT(ff != NULL);
  303         KASSERT(ff->ff_file == NULL);
  304         KASSERT(!ff->ff_allocated);
  305 
  306         ff->ff_allocated = true;
  307         fdp->fd_lomap[off] |= 1U << (fd & NDENTRYMASK);
  308         if (__predict_false(fdp->fd_lomap[off] == ~0)) {
                      /* The lomap word just became full: record it in himap. */
  309                 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
  310                     (1U << (off & NDENTRYMASK))) == 0);
  311                 fdp->fd_himap[off >> NDENTRYSHIFT] |= 1U << (off & NDENTRYMASK);
  312         }
  313 
  314         if ((int)fd > fdp->fd_lastfile) {
  315                 fdp->fd_lastfile = fd;
  316         }
  317 
  318         fd_checkmaps(fdp);
  319 }
  320 
      /*
       * Mark descriptor slot `fd' as free again; the inverse of fd_used().
       *
       * Lowers fd_freefile if fd lies below it, clears the slot's bit in
       * fd_lomap (and the word's bit in fd_himap if the word was full),
       * clears ff_allocated, and recomputes fd_lastfile with fd_last_set()
       * when fd was the last descriptor.
       *
       * Preconditions (asserted): fd_lock is held, the slot has an fdfile,
       * no file is installed in it, and it is currently allocated.
       */
  321 static inline void
  322 fd_unused(filedesc_t *fdp, unsigned fd)
  323 {
  324         u_int off = fd >> NDENTRYSHIFT;
  325         fdfile_t *ff;
  326 
  327         ff = fdp->fd_dt->dt_ff[fd];
  328 
  329         KASSERT(mutex_owned(&fdp->fd_lock));
  330         KASSERT(ff != NULL);
  331         KASSERT(ff->ff_file == NULL);
  332         KASSERT(ff->ff_allocated);
  333 
  334         if (fd < fdp->fd_freefile) {
  335                 fdp->fd_freefile = fd;
  336         }
  337 
              /* Test for a full word before the lomap bit is cleared below. */
  338         if (fdp->fd_lomap[off] == ~0) {
  339                 KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
  340                     (1U << (off & NDENTRYMASK))) != 0);
  341                 fdp->fd_himap[off >> NDENTRYSHIFT] &=
  342                     ~(1U << (off & NDENTRYMASK));
  343         }
  344         KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0);
  345         fdp->fd_lomap[off] &= ~(1U << (fd & NDENTRYMASK));
  346         ff->ff_allocated = false;
  347 
  348         KASSERT(fd <= fdp->fd_lastfile);
  349         if (fd == fdp->fd_lastfile) {
  350                 fdp->fd_lastfile = fd_last_set(fdp, fd);
  351         }
  352         fd_checkmaps(fdp);
  353 }
  354 
  355 /*
  356  * Look up the file structure corresponding to a file descriptor
  357  * and return the file, holding a reference on the descriptor.
       *
       * The lookup is done in the calling LWP's descriptor table
       * (curlwp->l_fd) without taking fd_lock.  Returns NULL if fd is out
       * of range, the slot has no fdfile, or no file is currently installed
       * in the slot; in the last case the reference just taken is dropped
       * again with fd_putfile().  A non-NULL result is released with
       * fd_putfile().
  358  */
  359 file_t *
  360 fd_getfile(unsigned fd)
  361 {
  362         filedesc_t *fdp;
  363         fdfile_t *ff;
  364         file_t *fp;
  365         fdtab_t *dt;
  366 
  367         /*
  368          * Look up the fdfile structure representing this descriptor.
  369          * We are doing this unlocked.  See fd_tryexpand().
  370          */
  371         fdp = curlwp->l_fd;
  372         dt = atomic_load_consume(&fdp->fd_dt);
  373         if (__predict_false(fd >= dt->dt_nfiles)) {
  374                 return NULL;
  375         }
  376         ff = dt->dt_ff[fd];
  377         KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
  378         if (__predict_false(ff == NULL)) {
  379                 return NULL;
  380         }
  381 
  382         /* Now get a reference to the descriptor. */
  383         if (fdp->fd_refcnt == 1) {
  384                 /*
  385                  * Single threaded: don't need to worry about concurrent
  386                  * access (other than earlier calls to kqueue, which may
  387                  * hold a reference to the descriptor).
  388                  */
  389                 ff->ff_refcnt++;
  390         } else {
  391                 /*
  392                  * Multi threaded: issue a memory barrier to ensure that we
  393                  * acquire the file pointer _after_ adding a reference.  If
  394                  * no memory barrier, we could fetch a stale pointer.
  395                  */
  396                 atomic_inc_uint(&ff->ff_refcnt);
  397 #ifndef __HAVE_ATOMIC_AS_MEMBAR
  398                 membar_enter();
  399 #endif
  400         }
  401 
  402         /*
  403          * If the file is not open or is being closed then put the
  404          * reference back.
  405          */
  406         fp = atomic_load_consume(&ff->ff_file);
  407         if (__predict_true(fp != NULL)) {
  408                 return fp;
  409         }
  410         fd_putfile(fd);
  411         return NULL;
  412 }
  413 
  414 /*
  415  * Release a reference to a file descriptor acquired with fd_getfile().
       *
       * Operates on the calling LWP's descriptor table.  If the FR_CLOSING
       * flag is found in the slot's reference count, the release is handed
       * to fd_close(), which drops the reference itself; otherwise the
       * count is simply decremented.  The slot must exist and hold at
       * least one reference (asserted).
  416  */
  417 void
  418 fd_putfile(unsigned fd)
  419 {
  420         filedesc_t *fdp;
  421         fdfile_t *ff;
  422         u_int u, v;
  423 
  424         fdp = curlwp->l_fd;
  425         KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
  426         ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
  427 
  428         KASSERT(ff != NULL);
  429         KASSERT((ff->ff_refcnt & FR_MASK) > 0);
  430         KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
  431 
  432         if (fdp->fd_refcnt == 1) {
  433                 /*
  434                  * Single threaded: don't need to worry about concurrent
  435                  * access (other than earlier calls to kqueue, which may
  436                  * hold a reference to the descriptor).
  437                  */
  438                 if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
  439                         fd_close(fd);
  440                         return;
  441                 }
  442                 ff->ff_refcnt--;
  443                 return;
  444         }
  445 
  446         /*
  447          * Ensure that any use of the file is complete and globally
  448          * visible before dropping the final reference.  If no membar,
  449          * the current CPU could still access memory associated with
  450          * the file after it has been freed or recycled by another
  451          * CPU.
  452          */
  453 #ifndef __HAVE_ATOMIC_AS_MEMBAR
  454         membar_exit();
  455 #endif
  456 
  457         /*
  458          * Be optimistic and start out with the assumption that no other
  459          * threads are trying to close the descriptor.  If the CAS fails,
  460          * we lost a race and/or it's being closed.
               *
               * The first expected value is the count with the bits outside
               * FR_MASK stripped; after a failed CAS the value actually
               * observed (v) becomes the next expected value, unless it
               * carries FR_CLOSING.
  461          */
  462         for (u = ff->ff_refcnt & FR_MASK;; u = v) {
  463                 v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
  464                 if (__predict_true(u == v)) {
  465                         return;
  466                 }
  467                 if (__predict_false((v & FR_CLOSING) != 0)) {
  468                         break;
  469                 }
  470         }
  471 
  472         /* Another thread is waiting to close the file: join it. */
  473         (void)fd_close(fd);
  474 }
  475 
  476 /*
  477  * Vnode flavour of fd_getfile(): on success the file is stored in
  478  * *fpp and the descriptor reference stays held for the caller.
       * Returns 0 on success, EBADF if no file is open on `fd' or its vnode
       * has type VBAD, and EINVAL if the file is not vnode-backed.  A
       * reference obtained along the way is dropped on every failure path.
  479  */
  480 int
  481 fd_getvnode(unsigned fd, file_t **fpp)
  482 {
  483         file_t *file;
  484         int error;
  485 
  486         if (__predict_false((file = fd_getfile(fd)) == NULL)) {
  487                 return EBADF;
  488         }
  489 
  490         if (__predict_false(file->f_type != DTYPE_VNODE)) {
  491                 error = EINVAL;
  492         } else if (__predict_false(file->f_vnode->v_type == VBAD)) {
  493                 /* XXX Is this case really necessary? */
  494                 error = EBADF;
  495         } else {
  496                 /* Success: the caller inherits the reference. */
  497                 *fpp = file;
  498                 return 0;
  499         }
  500         fd_putfile(fd);
  501         return error;
  502 }
  503 
  504 /*
  505  * Socket flavour of fd_getfile(): on success *sop receives the socket
  506  * and *fp the file, with the descriptor reference still held.
       * *fp is written with the lookup result even when the call fails.
       * Returns 0 on success, EBADF if no file is open on `fd', and
       * ENOTSOCK (after dropping the reference) if it is not a socket.
  507  */
  508 int
  509 fd_getsock1(unsigned fd, struct socket **sop, file_t **fp)
  510 {
  511         file_t *const file = *fp = fd_getfile(fd);
  512 
  513         if (__predict_false(file == NULL))
  514                 return EBADF;
  515         if (__predict_true(file->f_type == DTYPE_SOCKET)) {
  516                 *sop = file->f_socket;
  517                 return 0;
  518         }
  519         fd_putfile(fd);
  520         return ENOTSOCK;
  521 }
  522 
  523 int
  524 fd_getsock(unsigned fd, struct socket **sop)
  525 {
  526         file_t *fp;
  527         return fd_getsock1(fd, sop, &fp);
  528 }
  529 
  530 /*
  531  * Look up the file structure corresponding to a file descriptor
  532  * and return it with a reference held on the file, not the
  533  * descriptor.
  534  *
  535  * This is heavyweight and only used when accessing descriptors
  536  * from a foreign process.  The caller must ensure that `p' does
  537  * not exit or fork across this call.
  538  *
  539  * To release the file (not descriptor) reference, use closef().
       *
       * The lookup runs under p's fd_lock.  Returns NULL if fd is out of
       * range, the slot has no fdfile, or no file is installed in it.
       * Otherwise f_count is incremented under the file's f_lock and the
       * file is returned.
  540  */
  541 file_t *
  542 fd_getfile2(proc_t *p, unsigned fd)
  543 {
  544         filedesc_t *fdp;
  545         fdfile_t *ff;
  546         file_t *fp;
  547         fdtab_t *dt;
  548 
  549         fdp = p->p_fd;
  550         mutex_enter(&fdp->fd_lock);
  551         dt = fdp->fd_dt;
  552         if (fd >= dt->dt_nfiles) {
  553                 mutex_exit(&fdp->fd_lock);
  554                 return NULL;
  555         }
  556         if ((ff = dt->dt_ff[fd]) == NULL) {
  557                 mutex_exit(&fdp->fd_lock);
  558                 return NULL;
  559         }
  560         if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
  561                 mutex_exit(&fdp->fd_lock);
  562                 return NULL;
  563         }
              /* Take the file reference while fd_lock is still held. */
  564         mutex_enter(&fp->f_lock);
  565         fp->f_count++;
  566         mutex_exit(&fp->f_lock);
  567         mutex_exit(&fdp->fd_lock);
  568 
  569         return fp;
  570 }
  571 
  572 /*
  573  * Internal form of close.  Must be called with a reference to the
  574  * descriptor, and will drop the reference.  When all descriptor
  575  * references are dropped, releases the descriptor slot and a single
  576  * reference to the file structure.
       *
       * Operates on the calling LWP's descriptor table.  Returns EBADF if
       * another closer has already removed the file from the slot;
       * otherwise returns the result of closef() on the file.
       *
       * fd_lock is taken here and is dropped around knote_fdclose(),
       * fo_restart() and VOP_ADVLOCK().
  577  */
  578 int
  579 fd_close(unsigned fd)
  580 {
  581         struct flock lf;
  582         filedesc_t *fdp;
  583         fdfile_t *ff;
  584         file_t *fp;
  585         proc_t *p;
  586         lwp_t *l;
  587         u_int refcnt;
  588 
  589         l = curlwp;
  590         p = l->l_proc;
  591         fdp = l->l_fd;
  592         ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
  593 
  594         KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
  595 
  596         mutex_enter(&fdp->fd_lock);
  597         KASSERT((ff->ff_refcnt & FR_MASK) > 0);
  598         fp = atomic_load_consume(&ff->ff_file);
  599         if (__predict_false(fp == NULL)) {
  600                 /*
  601                  * Another user of the file is already closing, and is
  602                  * waiting for other users of the file to drain.  Release
  603                  * our reference, and wake up the closer.
  604                  */
  605                 atomic_dec_uint(&ff->ff_refcnt);
  606                 cv_broadcast(&ff->ff_closing);
  607                 mutex_exit(&fdp->fd_lock);
  608 
  609                 /*
  610                  * An application error, so pretend that the descriptor
  611                  * was already closed.  We can't safely wait for it to
  612                  * be closed without potentially deadlocking.
  613                  */
  614                 return (EBADF);
  615         }
  616         KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
  617 
  618         /*
  619          * There may be multiple users of this file within the process.
  620          * Notify existing and new users that the file is closing.  This
  621          * will prevent them from adding additional uses to this file
  622          * while we are closing it.
  623          */
  624         ff->ff_file = NULL;
  625         ff->ff_exclose = false;
  626 
  627         /*
  628          * We expect the caller to hold a descriptor reference - drop it.
  629          * The reference count may increase beyond zero at this point due
  630          * to an erroneous descriptor reference by an application, but
  631          * fd_getfile() will notice that the file is being closed and drop
  632          * the reference again.
  633          */
  634         if (fdp->fd_refcnt == 1) {
  635                 /* Single threaded. */
  636                 refcnt = --(ff->ff_refcnt);
  637         } else {
  638                 /* Multi threaded. */
  639 #ifndef __HAVE_ATOMIC_AS_MEMBAR
  640                 membar_producer();
  641 #endif
  642                 refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
  643         }
  644         if (__predict_false(refcnt != 0)) {
  645                 /*
  646                  * Wait for other references to drain.  This is typically
  647                  * an application error - the descriptor is being closed
  648                  * while still in use.
  649                  * (Or just a threaded application trying to unblock its
  650                  * thread that sleeps in (say) accept()).
  651                  */
  652                 atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
  653 
  654                 /*
  655                  * Remove any knotes attached to the file.  A knote
  656                  * attached to the descriptor can hold references on it.
  657                  */
  658                 mutex_exit(&fdp->fd_lock);
  659                 if (!SLIST_EMPTY(&ff->ff_knlist)) {
  660                         knote_fdclose(fd);
  661                 }
  662 
  663                 /*
  664                  * Since the file system code doesn't know which fd
  665                  * each request came from (think dup()), we have to
  666                  * ask it to return ERESTART for any long-term blocks.
  667                  * The re-entry through read/write/etc will detect the
  668                  * closed fd and return EBADF.
  669                  * Blocked partial writes may return a short length.
  670                  */
  671                 (*fp->f_ops->fo_restart)(fp);
  672                 mutex_enter(&fdp->fd_lock);
  673 
  674                 /*
  675                  * We need to see the count drop to zero at least once,
  676                  * in order to ensure that all pre-existing references
  677                  * have been drained.  New references past this point are
  678                  * of no interest.
  679                  * XXX (dsl) this may need to call fo_restart() after a
  680                  * timeout to guarantee that all the system calls exit.
  681                  */
  682                 while ((ff->ff_refcnt & FR_MASK) != 0) {
  683                         cv_wait(&ff->ff_closing, &fdp->fd_lock);
  684                 }
  685                 atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
  686         } else {
  687                 /* If no references, there must be no knotes. */
  688                 KASSERT(SLIST_EMPTY(&ff->ff_knlist));
  689         }
  690 
  691         /*
  692          * POSIX record locking dictates that any close releases ALL
  693          * locks owned by this process.  This is handled by setting
  694          * a flag in the unlock to free ONLY locks obeying POSIX
  695          * semantics, and not to free BSD-style file locks.
  696          * If the descriptor was in a message, POSIX-style locks
  697          * aren't passed with the descriptor.
  698          */
  699         if (__predict_false((p->p_flag & PK_ADVLOCK) != 0 &&
  700             fp->f_type == DTYPE_VNODE)) {
                      /* Unlock request for start 0, length 0 from SEEK_SET. */
  701                 lf.l_whence = SEEK_SET;
  702                 lf.l_start = 0;
  703                 lf.l_len = 0;
  704                 lf.l_type = F_UNLCK;
  705                 mutex_exit(&fdp->fd_lock);
  706                 (void)VOP_ADVLOCK(fp->f_vnode, p, F_UNLCK, &lf, F_POSIX);
  707                 mutex_enter(&fdp->fd_lock);
  708         }
  709 
  710         /* Free descriptor slot. */
  711         fd_unused(fdp, fd);
  712         mutex_exit(&fdp->fd_lock);
  713 
  714         /* Now drop reference to the file itself. */
  715         return closef(fp);
  716 }
  717 
  718 /*
  719  * Duplicate a file descriptor.
       *
       * Allocates a descriptor in the current process with fd_alloc(),
       * passing `minfd' as the wanted starting point, and stores its number
       * in *newp.  Whenever fd_alloc() fails with ENOSPC the table is
       * expanded with fd_tryexpand() and the allocation is retried; any
       * other error is returned to the caller.  The new slot's
       * close-on-exec flag is set from `exclose' and `fp' is attached with
       * fd_affix().  Returns 0 on success.
  720  */
  721 int
  722 fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
  723 {
  724         proc_t *p = curproc;
  725         fdtab_t *dt;
  726         int error;
  727 
  728         while ((error = fd_alloc(p, minfd, newp)) != 0) {
  729                 if (error != ENOSPC) {
  730                         return error;
  731                 }
  732                 fd_tryexpand(p);
  733         }
  734 
              /* Reload the table pointer after the allocation loop. */
  735         dt = atomic_load_consume(&curlwp->l_fd->fd_dt);
  736         dt->dt_ff[*newp]->ff_exclose = exclose;
  737         fd_affix(p, fp, *newp);
  738         return 0;
  739 }
  740 
  741 /*
  742  * dup2 operation.
       *
       * Installs `fp' at descriptor `newfd' in the calling LWP's table,
       * first closing whatever is open there.  `flags' may contain only
       * O_CLOEXEC, O_NONBLOCK and O_NOSIGPIPE; anything else yields EINVAL.
       * O_CLOEXEC sets the slot's close-on-exec flag, and the FNONBLOCK and
       * FNOSIGPIPE bits of `flags' are OR-ed into fp->f_flag.
       * Returns 0 once the flags have been validated.
  743  */
  744 int
  745 fd_dup2(file_t *fp, unsigned newfd, int flags)
  746 {
  747         filedesc_t *fdp = curlwp->l_fd;
  748         fdfile_t *ff;
  749         fdtab_t *dt;
  750 
  751         if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
  752                 return EINVAL;
  753         /*
  754          * Ensure there are enough slots in the descriptor table,
  755          * and allocate an fdfile_t up front in case we need it.
  756          */
  757         while (newfd >= atomic_load_consume(&fdp->fd_dt)->dt_nfiles) {
  758                 fd_tryexpand(curproc);
  759         }
  760         ff = pool_cache_get(fdfile_cache, PR_WAITOK);
  761 
  762         /*
  763          * If there is already a file open, close it.  If the file is
  764          * half open, wait for it to be constructed before closing it.
  765          * XXX Potential for deadlock here?
  766          */
  767         mutex_enter(&fdp->fd_lock);
  768         while (fd_isused(fdp, newfd)) {
  769                 mutex_exit(&fdp->fd_lock);
  770                 if (fd_getfile(newfd) != NULL) {
  771                         (void)fd_close(newfd);
  772                 } else {
  773                         /*
  774                          * Crummy, but unlikely to happen.
  775                          * Can occur if we interrupt another
  776                          * thread while it is opening a file.
  777                          */
  778                         kpause("dup2", false, 1, NULL);
  779                 }
  780                 mutex_enter(&fdp->fd_lock);
  781         }
  782         dt = fdp->fd_dt;
  783         if (dt->dt_ff[newfd] == NULL) {
                      /*
                       * Empty slot: install the preallocated fdfile and
                       * clear ff so it is not returned to the cache below.
                       */
  784                 KASSERT(newfd >= NDFDFILE);
  785                 dt->dt_ff[newfd] = ff;
  786                 ff = NULL;
  787         }
  788         fd_used(fdp, newfd);
  789         mutex_exit(&fdp->fd_lock);
  790 
  791         dt->dt_ff[newfd]->ff_exclose = (flags & O_CLOEXEC) != 0;
  792         fp->f_flag |= flags & (FNONBLOCK|FNOSIGPIPE);
  793         /* Slot is now allocated.  Insert copy of the file. */
  794         fd_affix(curproc, fp, newfd);
  795         if (ff != NULL) {
                      /* The slot already had an fdfile; return the spare. */
  796                 pool_cache_put(fdfile_cache, ff);
  797         }
  798         return 0;
  799 }
  800 
  801 /*
  802  * Drop reference to a file structure.
       *
       * Decrements f_count under f_lock.  If other references remain the
       * function returns 0 immediately.  On the last reference it releases
       * an flock-style lock when FHASLOCK is set on a vnode file, calls the
       * file's fo_close operation if f_ops is set, and returns the file to
       * file_cache.  The return value is that of fo_close, or 0 when f_ops
       * is NULL.
  803  */
  804 int
  805 closef(file_t *fp)
  806 {
  807         struct flock lf;
  808         int error;
  809 
  810         /*
  811          * Drop reference.  If referenced elsewhere it's still open
  812          * and we have nothing more to do.
  813          */
  814         mutex_enter(&fp->f_lock);
  815         KASSERT(fp->f_count > 0);
  816         if (--fp->f_count > 0) {
  817                 mutex_exit(&fp->f_lock);
  818                 return 0;
  819         }
  820         KASSERT(fp->f_count == 0);
  821         mutex_exit(&fp->f_lock);
  822 
  823         /* We held the last reference - release locks, close and free. */
  824         if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) {
                      /* Unlock request for start 0, length 0 from SEEK_SET. */
  825                 lf.l_whence = SEEK_SET;
  826                 lf.l_start = 0;
  827                 lf.l_len = 0;
  828                 lf.l_type = F_UNLCK;
  829                 (void)VOP_ADVLOCK(fp->f_vnode, fp, F_UNLCK, &lf, F_FLOCK);
  830         }
  831         if (fp->f_ops != NULL) {
  832                 error = (*fp->f_ops->fo_close)(fp);
  833         } else {
  834                 error = 0;
  835         }
  836         KASSERT(fp->f_count == 0);
  837         KASSERT(fp->f_cred != NULL);
  838         pool_cache_put(file_cache, fp);
  839 
  840         return error;
  841 }
  842 
  843 /*
  844  * Allocate a file descriptor for the process: 0 and *result on success,
  845  * EMFILE at the descriptor limit, ENOSPC if the table must grow first. */
  846 int
  847 fd_alloc(proc_t *p, int want, int *result)
  848 {
  849         filedesc_t *fdp = p->p_fd;
  850         int i, lim, last, error, hi;
  851         u_int off;
  852         fdtab_t *dt;
  853 
  854         KASSERT(p == curproc || p == &proc0);
  855 
  856         /*
  857          * Search for a free descriptor starting at the higher
  858          * of want or fd_freefile.  The search runs under fd_lock.
  859          */
  860         mutex_enter(&fdp->fd_lock);
  861         fd_checkmaps(fdp);
  862         dt = fdp->fd_dt;
  863         KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
  864         lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
  865         last = uimin(dt->dt_nfiles, lim);       /* search bound */
  866         for (;;) {
  867                 if ((i = want) < fdp->fd_freefile)
  868                         i = fdp->fd_freefile;
  869                 off = i >> NDENTRYSHIFT;        /* lomap word holding bit i */
  870                 hi = fd_next_zero(fdp, fdp->fd_himap, off,
  871                     (last + NDENTRIES - 1) >> NDENTRYSHIFT);
  872                 if (hi == -1)
  873                         break;
  874                 i = fd_next_zero(fdp, &fdp->fd_lomap[hi],
  875                     hi > off ? 0 : i & NDENTRYMASK, NDENTRIES);
  876                 if (i == -1) {
  877                         /*
  878                          * Free file descriptor in this block was
  879                          * below want, try again with higher want.
  880                          */
  881                         want = (hi + 1) << NDENTRYSHIFT;
  882                         continue;
  883                 }
  884                 i += (hi << NDENTRYSHIFT);      /* word index + bit = fd */
  885                 if (i >= last) {                /* past table or limit */
  886                         break;
  887                 }
  888                 if (dt->dt_ff[i] == NULL) {     /* no fdfile_t yet */
  889                         KASSERT(i >= NDFDFILE);
  890                         dt->dt_ff[i] = pool_cache_get(fdfile_cache, PR_WAITOK);
  891                 }
  892                 KASSERT(dt->dt_ff[i]->ff_file == NULL);
  893                 fd_used(fdp, i);
  894                 if (want <= fdp->fd_freefile) {
  895                         fdp->fd_freefile = i;
  896                 }
  897                 *result = i;
  898                 KASSERT(i >= NDFDFILE ||
  899                     dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
  900                 fd_checkmaps(fdp);
  901                 mutex_exit(&fdp->fd_lock);
  902                 return 0;
  903         }
  904 
  905         /* No space in current array.  Let the caller expand and retry. */
  906         error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
  907         mutex_exit(&fdp->fd_lock);
  908         return error;
  909 }
  910 
  911 /*
  912  * Get storage for a descriptor table with room for n slots.
  913  */
  914 static fdtab_t *
  915 fd_dtab_alloc(int n)
  916 {
  917         fdtab_t *tab;
  918         size_t nbytes;
  919 
  920         KASSERT(n > NDFILE);
  921         /* Base structure plus the (n - NDFILE) slots beyond it. */
  922         nbytes = sizeof(*tab) + (n - NDFILE) * sizeof(tab->dt_ff[0]);
  923         tab = kmem_alloc(nbytes, KM_SLEEP);
  924 #ifdef DIAGNOSTIC
  925         memset(tab, 0xff, nbytes);
  926 #endif
  927         tab->dt_link = NULL;
  928         tab->dt_nfiles = n;
  929         return tab;
  930 }
  931 
  932 /*
  933  * Release a descriptor table and every table chained behind it via dt_link.
  934  */
  935 static void
  936 fd_dtab_free(fdtab_t *dt)
  937 {
  938         fdtab_t *link;
  939         size_t nbytes;
  940 
  941         do {
  942                 /* Fetch the link first; dt is gone after kmem_free(). */
  943                 link = dt->dt_link;
  944                 KASSERT(dt->dt_nfiles > NDFILE);
  945                 nbytes = sizeof(*dt) +
  946                     (dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
  947 #ifdef DIAGNOSTIC
  948                 memset(dt, 0xff, nbytes);
  949 #endif
  950                 kmem_free(dt, nbytes);
  951         } while ((dt = link) != NULL);
  952 }
  953 
  954 /*
  955  * Get one chunk of memory holding both descriptor bitmaps, low map first.
  956  */
  957 static void
  958 fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
  959 {
  960         size_t lowords, hiwords;
  961         void *base;
  962 
  963         KASSERT(n > NDENTRIES);
  964 
  965         lowords = NDLOSLOTS(n);
  966         hiwords = NDHISLOTS(n);
  967         base = kmem_alloc((lowords + hiwords) * sizeof(uint32_t), KM_SLEEP);
  968         *lo = base;
  969         *hi = *lo + lowords;
  970 }
  971 
  972 /*
  973  * Give back a bitmap pair that was obtained from fd_map_alloc(n, ...).
  974  */
  975 static void
  976 fd_map_free(int n, uint32_t *lo, uint32_t *hi)
  977 {
  978         size_t lowords, hiwords;
  979 
  980         KASSERT(n > NDENTRIES);
  981 
  982         lowords = NDLOSLOTS(n);
  983         hiwords = NDHISLOTS(n);
  984         KASSERT(hi == lo + lowords);
  985         kmem_free(lo, (lowords + hiwords) * sizeof(uint32_t));
  986 }
  987 
  988 /*
  989  * Try to grow a process' descriptor table (NDEXTENT, then doubling); on
  990  * a lost race nothing changes and the caller is expected to retry. */
  991 void
  992 fd_tryexpand(proc_t *p)
  993 {
  994         filedesc_t *fdp;
  995         int i, numfiles, oldnfiles;
  996         fdtab_t *newdt, *dt;
  997         uint32_t *newhimap, *newlomap;
  998 
  999         KASSERT(p == curproc || p == &proc0);
 1000 
 1001         fdp = p->p_fd;
 1002         newhimap = NULL;
 1003         newlomap = NULL;
 1004         oldnfiles = atomic_load_consume(&fdp->fd_dt)->dt_nfiles;
 1005         /* New size: at least NDEXTENT slots, doubling after that. */
 1006         if (oldnfiles < NDEXTENT)
 1007                 numfiles = NDEXTENT;
 1008         else
 1009                 numfiles = 2 * oldnfiles;
 1010         /* Allocate everything up front, before fd_lock is taken. */
 1011         newdt = fd_dtab_alloc(numfiles);
 1012         if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
 1013                 fd_map_alloc(numfiles, &newlomap, &newhimap);
 1014         }
 1015 
 1016         mutex_enter(&fdp->fd_lock);
 1017         dt = fdp->fd_dt;
 1018         KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
 1019         if (dt->dt_nfiles != oldnfiles) {
 1020                 /* fdp changed since we sized things; caller must retry */
 1021                 mutex_exit(&fdp->fd_lock);
 1022                 fd_dtab_free(newdt);
 1023                 if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
 1024                         fd_map_free(numfiles, newlomap, newhimap);
 1025                 }
 1026                 return;
 1027         }
 1028 
 1029         /* Copy the existing descriptor table and zero the new portion. */
 1030         i = sizeof(fdfile_t *) * oldnfiles;
 1031         memcpy(newdt->dt_ff, dt->dt_ff, i);
 1032         memset((uint8_t *)newdt->dt_ff + i, 0,
 1033             numfiles * sizeof(fdfile_t *) - i);
 1034 
 1035         /*
 1036          * Link old descriptor array into list to be discarded.  We defer
 1037          * freeing until the last reference to the descriptor table goes
 1038          * away (usually process exit).  This allows us to do lockless
 1039          * lookups in fd_getfile().
 1040          */
 1041         if (oldnfiles > NDFILE) {       /* i.e. not the built-in table */
 1042                 if (fdp->fd_refcnt > 1) {
 1043                         newdt->dt_link = dt;
 1044                 } else {
 1045                         fd_dtab_free(dt);
 1046                 }
 1047         }
 1048         /* Larger bitmaps were allocated: copy the old bits, zero the rest. */
 1049         if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
 1050                 i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
 1051                 memcpy(newhimap, fdp->fd_himap, i);
 1052                 memset((uint8_t *)newhimap + i, 0,
 1053                     NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
 1054 
 1055                 i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
 1056                 memcpy(newlomap, fdp->fd_lomap, i);
 1057                 memset((uint8_t *)newlomap + i, 0,
 1058                     NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
 1059                 /* Old maps that were not the built-in ones get freed. */
 1060                 if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
 1061                         fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
 1062                 }
 1063                 fdp->fd_himap = newhimap;
 1064                 fdp->fd_lomap = newlomap;
 1065         }
 1066 
 1067         /*
 1068          * All other modifications must become globally visible before
 1069          * the change to fd_dt.  See fd_getfile().
 1070          */
 1071         atomic_store_release(&fdp->fd_dt, newdt);
 1072         KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
 1073         fd_checkmaps(fdp);
 1074         mutex_exit(&fdp->fd_lock);
 1075 }
 1076 
 1077 /*
 1078  * Hand the current process a reserved descriptor number (*resultfd)
 1079  * together with a file structure taken from file_cache (*resultfp).
 1080  */
 1081 int
 1082 fd_allocfile(file_t **resultfp, int *resultfd)
 1083 {
 1084         proc_t * const self = curproc;
 1085         kauth_cred_t want;
 1086         file_t *nfp;
 1087         int rv;
 1088 
 1089         /* Reserve a slot, growing the table each time ENOSPC comes back. */
 1090         for (;;) {
 1091                 rv = fd_alloc(self, 0, resultfd);
 1092                 if (rv == 0)
 1093                         break;
 1094                 if (rv != ENOSPC)
 1095                         return rv;
 1096                 fd_tryexpand(self);
 1097         }
 1098 
 1099         /* No file structure to be had: give the reserved slot back. */
 1100         nfp = pool_cache_get(file_cache, PR_WAITOK);
 1101         if (nfp == NULL) {
 1102                 fd_abort(self, NULL, *resultfd);
 1103                 return ENFILE;
 1104         }
 1105         KASSERT(nfp->f_count == 0);
 1106         KASSERT(nfp->f_msgcount == 0);
 1107         KASSERT(nfp->f_unpcount == 0);
 1108 
 1109         /* The cached object may carry another credential; swap in ours. */
 1110         want = curlwp->l_cred;
 1111         if (__predict_false(nfp->f_cred != want)) {
 1112                 kauth_cred_free(nfp->f_cred);
 1113                 kauth_cred_hold(want);
 1114                 nfp->f_cred = want;
 1115         }
 1116 
 1117         /* A recycled file must not stay marked FSCAN; see uipc_usrreq.c. */
 1118         if (__predict_false((nfp->f_flag & FSCAN) != 0)) {
 1119                 mutex_enter(&nfp->f_lock);
 1120                 atomic_and_uint(&nfp->f_flag, ~FSCAN);
 1121                 mutex_exit(&nfp->f_lock);
 1122         }
 1123 
 1124         nfp->f_offset = 0;
 1125         nfp->f_advice = 0;
 1126         *resultfp = nfp;
 1127         return 0;
 1128 }
 1129 
 1130 /*
 1131  * Successful creation of a new descriptor: take a reference on fp and
 1132  * store it in slot fd, which makes it visible to the process. */
 1133 void
 1134 fd_affix(proc_t *p, file_t *fp, unsigned fd)
 1135 {
 1136         fdfile_t *ff;
 1137         filedesc_t *fdp;
 1138         fdtab_t *dt;
 1139 
 1140         KASSERT(p == curproc || p == &proc0);
 1141 
 1142         /* Add the descriptor's reference to the file structure. */
 1143         mutex_enter(&fp->f_lock);
 1144         fp->f_count++;
 1145         mutex_exit(&fp->f_lock);
 1146 
 1147         /*
 1148          * Insert the new file into the descriptor slot.
 1149          *
 1150          * The memory barriers provided by lock activity in this routine
 1151          * ensure that any updates to the file structure become globally
 1152          * visible before the file becomes visible to other LWPs in the
 1153          * current process; otherwise we would set ff->ff_file with
 1154          * atomic_store_release(&ff->ff_file, fp) at the bottom.
 1155          */
 1156         fdp = p->p_fd;
 1157         dt = atomic_load_consume(&fdp->fd_dt);
 1158         ff = dt->dt_ff[fd];
 1159         /* The slot must already be reserved and must still be empty. */
 1160         KASSERT(ff != NULL);
 1161         KASSERT(ff->ff_file == NULL);
 1162         KASSERT(ff->ff_allocated);
 1163         KASSERT(fd_isused(fdp, fd));
 1164         KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
 1165 
 1166         /* No need to lock in order to make file initially visible. */
 1167         ff->ff_file = fp;
 1168 }
 1169 
 1170 /*
 1171  * Back out of creating a descriptor: release slot fd and, if given, fp.
 1172  */
 1173 void
 1174 fd_abort(proc_t *p, file_t *fp, unsigned fd)
 1175 {
 1176         filedesc_t *table;
 1177         fdfile_t *slot;
 1178 
 1179         KASSERT(p == curproc || p == &proc0);
 1180 
 1181         table = p->p_fd;
 1182         slot = atomic_load_consume(&table->fd_dt)->dt_ff[fd];
 1183         slot->ff_exclose = false;
 1184         KASSERT(fd >= NDFDFILE || slot == (fdfile_t *)table->fd_dfdfile[fd]);
 1185 
 1186         mutex_enter(&table->fd_lock);
 1187         KASSERT(fd_isused(table, fd));
 1188         fd_unused(table, fd);
 1189         mutex_exit(&table->fd_lock);
 1190 
 1191         /* A never-referenced file, when supplied, returns to its cache. */
 1192         if (fp == NULL)
 1193                 return;
 1194         KASSERT(fp->f_count == 0);
 1195         KASSERT(fp->f_cred != NULL);
 1196         pool_cache_put(file_cache, fp);
 1197 }
 1198 
 1199 static int
 1200 file_ctor(void *arg, void *obj, int flags)
 1201 {
 1202         file_t * const nfp = obj;
 1203 
 1204         memset(nfp, 0, sizeof(*nfp));
 1205 
 1206         /* nfiles and the filehead list are updated under filelist_lock. */
 1207         mutex_enter(&filelist_lock);
 1208         if (__predict_true(nfiles < maxfiles)) {
 1209                 nfiles++;
 1210                 LIST_INSERT_HEAD(&filehead, nfp, f_list);
 1211                 mutex_init(&nfp->f_lock, MUTEX_DEFAULT, IPL_NONE);
 1212                 nfp->f_cred = curlwp->l_cred;
 1213                 kauth_cred_hold(nfp->f_cred);
 1214                 mutex_exit(&filelist_lock);
 1215                 return 0;
 1216         }
 1217         mutex_exit(&filelist_lock);
 1218         tablefull("file", "increase kern.maxfiles or MAXFILES");
 1219         return ENFILE;
 1220 }
 1221 
 1222 static void
 1223 file_dtor(void *arg, void *obj)
 1224 {
 1225         file_t * const dead = obj;
 1226 
 1227         mutex_enter(&filelist_lock);
 1228         LIST_REMOVE(dead, f_list);
 1229         nfiles--;
 1230         mutex_exit(&filelist_lock);
 1231         /* Off the list now; release what file_ctor() attached. */
 1232         KASSERT(dead->f_count == 0);
 1233         kauth_cred_free(dead->f_cred);
 1234         mutex_destroy(&dead->f_lock);
 1235 }
 1236 
 1237 static int
 1238 fdfile_ctor(void *arg, void *obj, int flags)
 1239 {
 1240         fdfile_t * const slot = obj;
 1241 
 1242         /* Start from all-zero state, then initialise the ff_closing condvar. */
 1243         memset(slot, 0, sizeof(fdfile_t));
 1244         cv_init(&slot->ff_closing, "fdclose");
 1245         return 0;
 1246 }
 1247 
 1248 static void
 1249 fdfile_dtor(void *arg, void *obj)
 1250 {
 1251 
 1252         /* Undo the cv_init() done by fdfile_ctor(). */
 1253         cv_destroy(&((fdfile_t *)obj)->ff_closing);
 1254 }
 1255 
 1256 file_t *
 1257 fgetdummy(void)
 1258 {
 1259         file_t * const dummy = kmem_zalloc(sizeof(*dummy), KM_SLEEP);
 1260 
 1261         /* Zero-filled and outside file_cache; only f_lock gets set up. */
 1262         mutex_init(&dummy->f_lock, MUTEX_DEFAULT, IPL_NONE);
 1263         return dummy;
 1264 }
 1265 
 1266 void
 1267 fputdummy(file_t *fp)
 1268 {
 1269         /* Counterpart of fgetdummy(): tear down f_lock, free the memory. */
 1270         mutex_destroy(&fp->f_lock);
 1271         kmem_free(fp, sizeof(file_t));
 1272 }
 1273 
 1274 /*
 1275  * Produce an empty filedesc structure that holds a single reference.
 1276  */
 1277 filedesc_t *
 1278 fd_init(filedesc_t *fdp)
 1279 {
 1280 #ifdef DIAGNOSTIC
 1281         unsigned slot;
 1282 #endif
 1283 
 1284         if (__predict_false(fdp != NULL)) {
 1285                 /* Caller-supplied storage; only filedesc0 is expected. */
 1286                 KASSERT(fdp == &filedesc0);
 1287                 filedesc_ctor(NULL, fdp, PR_WAITOK);
 1288         } else {
 1289                 fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
 1290         }
 1291 #ifdef DIAGNOSTIC
 1292         KASSERT(fdp->fd_lastfile == -1);
 1293         KASSERT(fdp->fd_lastkqfile == -1);
 1294         KASSERT(fdp->fd_knhash == NULL);
 1295         KASSERT(fdp->fd_freefile == 0);
 1296         KASSERT(fdp->fd_exclose == false);
 1297         KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
 1298         KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
 1299         for (slot = 0; slot < NDFDFILE; slot++) {
 1300                 KASSERT(fdp->fd_dtbuiltin.dt_ff[slot] ==
 1301                     (fdfile_t *)fdp->fd_dfdfile[slot]);
 1302         }
 1303         for (slot = NDFDFILE; slot < NDFILE; slot++) {
 1304                 KASSERT(fdp->fd_dtbuiltin.dt_ff[slot] == NULL);
 1305         }
 1306         KASSERT(fdp->fd_himap == fdp->fd_dhimap);
 1307         KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
 1308 #endif  /* DIAGNOSTIC */
 1309 
 1310         fdp->fd_refcnt = 1;
 1311         fd_checkmaps(fdp);
 1312 
 1313         return fdp;
 1314 }
 1315 
 1316 /*
 1317  * Construct a filedesc_t: zeroed, built-in table and bitmaps installed.
 1318  */
 1319 static int
 1320 filedesc_ctor(void *arg, void *obj, int flag)
 1321 {
 1322         filedesc_t * const nfdp = obj;
 1323         fdtab_t *builtin;
 1324         int slot;
 1325 
 1326         memset(nfdp, 0, sizeof(*nfdp));
 1327         mutex_init(&nfdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
 1328         nfdp->fd_lastkqfile = -1;
 1329         nfdp->fd_lastfile = -1;
 1330         nfdp->fd_himap = nfdp->fd_dhimap;
 1331         nfdp->fd_lomap = nfdp->fd_dlomap;
 1332         builtin = &nfdp->fd_dtbuiltin;
 1333         builtin->dt_nfiles = NDFILE;
 1334         nfdp->fd_dt = builtin;
 1335         /* The first NDFDFILE slots use storage embedded in fd_dfdfile. */
 1336         CTASSERT(sizeof(nfdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
 1337         for (slot = 0; slot < NDFDFILE; slot++) {
 1338                 builtin->dt_ff[slot] = (fdfile_t *)nfdp->fd_dfdfile[slot];
 1339                 (void)fdfile_ctor(NULL, nfdp->fd_dfdfile[slot], PR_WAITOK);
 1340         }
 1341         return 0;
 1342 }
 1343 
 1344 static void
 1345 filedesc_dtor(void *arg, void *obj)
 1346 {
 1347         filedesc_t * const dead = obj;
 1348         int slot = 0;
 1349 
 1350         /* Tear down each embedded fdfile_t, then the table lock. */
 1351         while (slot < NDFDFILE) {
 1352                 fdfile_dtor(NULL, dead->fd_dfdfile[slot++]);
 1353         }
 1354         mutex_destroy(&dead->fd_lock);
 1355 }
 1356 
 1357 /*
 1358  * Point p at the filedesc of the calling LWP and count the new user.
 1359  */
 1360 void
 1361 fd_share(struct proc *p)
 1362 {
 1363         filedesc_t * const shared = curlwp->l_fd;
 1364 
 1365         /* Install the pointer first, then account for the extra user. */
 1366         p->p_fd = shared;
 1367         atomic_inc_uint(&shared->fd_refcnt);
 1368 }
 1369 
 1370 /*
 1371  * Take an additional reference on the filedesc that LWP l uses.
 1372  */
 1373 void
 1374 fd_hold(lwp_t *l)
 1375 {
 1376 
 1377         /* Only the reference count changes; l->l_fd is left alone. */
 1378         atomic_inc_uint(&l->l_fd->fd_refcnt);
 1379 }
 1380 
 1381 /*
 1382  * Copy curproc's filedesc structure into a new one holding a single
 1383  * reference; kqueue and half-open descriptors are not carried over. */
 1384 filedesc_t *
 1385 fd_copy(void)
 1386 {
 1387         filedesc_t *newfdp, *fdp;
 1388         fdfile_t *ff, **ffp, **nffp, *ff2;
 1389         int i, j, numfiles, lastfile, newlast;
 1390         file_t *fp;
 1391         fdtab_t *newdt;
 1392 
 1393         fdp = curproc->p_fd;
 1394         newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
 1395         newfdp->fd_refcnt = 1;
 1396 
 1397 #ifdef DIAGNOSTIC
 1398         KASSERT(newfdp->fd_lastfile == -1);
 1399         KASSERT(newfdp->fd_lastkqfile == -1);
 1400         KASSERT(newfdp->fd_knhash == NULL);
 1401         KASSERT(newfdp->fd_freefile == 0);
 1402         KASSERT(newfdp->fd_exclose == false);
 1403         KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
 1404         KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
 1405         for (i = 0; i < NDFDFILE; i++) {
 1406                 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
 1407                     (fdfile_t *)newfdp->fd_dfdfile[i]);
 1408         }
 1409         for (i = NDFDFILE; i < NDFILE; i++) {
 1410                 KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
 1411         }
 1412 #endif  /* DIAGNOSTIC */
 1413 
 1414         mutex_enter(&fdp->fd_lock);
 1415         fd_checkmaps(fdp);
 1416         numfiles = fdp->fd_dt->dt_nfiles;
 1417         lastfile = fdp->fd_lastfile;
 1418 
 1419         /*
 1420          * If the number of open files fits in the internal arrays
 1421          * of the open file structure, use them, otherwise allocate
 1422          * additional memory for the number of descriptors currently
 1423          * in use.
 1424          */
 1425         if (lastfile < NDFILE) {
 1426                 i = NDFILE;
 1427                 newdt = newfdp->fd_dt;
 1428                 KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
 1429         } else {
 1430                 /*
 1431                  * Compute the smallest multiple of NDEXTENT needed
 1432                  * for the file descriptors currently in use,
 1433                  * allowing the table to shrink.
 1434                  */
 1435                 i = numfiles;
 1436                 while (i >= 2 * NDEXTENT && i > lastfile * 2) {
 1437                         i /= 2;
 1438                 }
 1439                 KASSERT(i > NDFILE);
 1440                 newdt = fd_dtab_alloc(i);
 1441                 newfdp->fd_dt = newdt;
 1442                 memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
 1443                     NDFDFILE * sizeof(newdt->dt_ff[0]));
 1444                 memset(newdt->dt_ff + NDFDFILE, 0,
 1445                     (i - NDFDFILE) * sizeof(newdt->dt_ff[0]));
 1446         }
 1447         if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
 1448                 newfdp->fd_himap = newfdp->fd_dhimap;
 1449                 newfdp->fd_lomap = newfdp->fd_dlomap;
 1450         } else {
 1451                 fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
 1452                 KASSERT(i >= NDENTRIES * NDENTRIES);
 1453                 memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
 1454                 memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
 1455         }
 1456         newfdp->fd_freefile = fdp->fd_freefile;
 1457         newfdp->fd_exclose = fdp->fd_exclose;
 1458         /* Visit source slots 0..fd_lastfile, copying the open ones. */
 1459         ffp = fdp->fd_dt->dt_ff;
 1460         nffp = newdt->dt_ff;
 1461         newlast = -1;
 1462         for (i = 0; i <= lastfile; i++, ffp++, nffp++) {
 1463                 KASSERT(i >= NDFDFILE ||
 1464                     *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
 1465                 ff = *ffp;
 1466                 if (ff == NULL ||
 1467                     (fp = atomic_load_consume(&ff->ff_file)) == NULL) {
 1468                         /* Descriptor unused, or descriptor half open. */
 1469                         KASSERT(!fd_isused(newfdp, i));
 1470                         continue;
 1471                 }
 1472                 if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
 1473                         /* kqueue descriptors cannot be copied. */
 1474                         if (i < newfdp->fd_freefile) {
 1475                                 newfdp->fd_freefile = i;
 1476                         }
 1477                         continue;
 1478                 }
 1479                 /* It's active: add a reference to the file. */
 1480                 mutex_enter(&fp->f_lock);
 1481                 fp->f_count++;
 1482                 mutex_exit(&fp->f_lock);
 1483 
 1484                 /* Allocate an fdfile_t to represent it. */
 1485                 if (i >= NDFDFILE) {
 1486                         ff2 = pool_cache_get(fdfile_cache, PR_WAITOK);
 1487                         *nffp = ff2;
 1488                 } else {
 1489                         ff2 = newdt->dt_ff[i];
 1490                 }
 1491                 ff2->ff_file = fp;
 1492                 ff2->ff_exclose = ff->ff_exclose;
 1493                 ff2->ff_allocated = true;
 1494 
 1495                 /* Fix up bitmaps: set bit i, and the himap bit once full. */
 1496                 j = i >> NDENTRYSHIFT;
 1497                 KASSERT((newfdp->fd_lomap[j] & (1U << (i & NDENTRYMASK))) == 0);
 1498                 newfdp->fd_lomap[j] |= 1U << (i & NDENTRYMASK);
 1499                 if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
 1500                         KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
 1501                             (1U << (j & NDENTRYMASK))) == 0);
 1502                         newfdp->fd_himap[j >> NDENTRYSHIFT] |=
 1503                             1U << (j & NDENTRYMASK);
 1504                 }
 1505                 newlast = i;
 1506         }
 1507         KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
 1508         newfdp->fd_lastfile = newlast;
 1509         fd_checkmaps(newfdp);
 1510         mutex_exit(&fdp->fd_lock);
 1511 
 1512         return newfdp;
 1513 }
 1514 
 1515 /*
 1516  * Drop the calling LWP's reference to its filedesc structure; the last
 1517  * reference closes all files and returns it to filedesc_cache. */
 1518 void
 1519 fd_free(void)
 1520 {
 1521         fdfile_t *ff;
 1522         file_t *fp;
 1523         int fd, nf;
 1524         fdtab_t *dt;
 1525         lwp_t * const l = curlwp;
 1526         filedesc_t * const fdp = l->l_fd;
 1527         const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
 1528 
 1529         KASSERT(atomic_load_consume(&fdp->fd_dt)->dt_ff[0] ==
 1530             (fdfile_t *)fdp->fd_dfdfile[0]);
 1531         KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
 1532         KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
 1533         /* Drop our reference; whoever drops the last one tears down. */
 1534 #ifndef __HAVE_ATOMIC_AS_MEMBAR
 1535         membar_exit();
 1536 #endif
 1537         if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
 1538                 return;
 1539 
 1540         /*
 1541          * Close any files that the process holds open.
 1542          */
 1543         dt = fdp->fd_dt;
 1544         fd_checkmaps(fdp);
 1545 #ifdef DEBUG
 1546         fdp->fd_refcnt = -1; /* see fd_checkmaps */
 1547 #endif
 1548         for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
 1549                 ff = dt->dt_ff[fd];
 1550                 KASSERT(fd >= NDFDFILE ||
 1551                     ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
 1552                 if (ff == NULL)
 1553                         continue;
 1554                 if ((fp = atomic_load_consume(&ff->ff_file)) != NULL) {
 1555                         /*
 1556                          * Must use fd_close() here if there is
 1557                          * a reference from kqueue or we might have posix
 1558                          * advisory locks.
 1559                          */
 1560                         if (__predict_true(ff->ff_refcnt == 0) &&
 1561                             (noadvlock || fp->f_type != DTYPE_VNODE)) {
 1562                                 ff->ff_file = NULL;
 1563                                 ff->ff_exclose = false;
 1564                                 ff->ff_allocated = false;
 1565                                 closef(fp);
 1566                         } else {
 1567                                 ff->ff_refcnt++; /* presumably for fd_close() */
 1568                                 fd_close(fd);
 1569                         }
 1570                 }
 1571                 KASSERT(ff->ff_refcnt == 0);
 1572                 KASSERT(ff->ff_file == NULL);
 1573                 KASSERT(!ff->ff_exclose);
 1574                 KASSERT(!ff->ff_allocated);
 1575                 if (fd >= NDFDFILE) {   /* not one of the embedded slots */
 1576                         pool_cache_put(fdfile_cache, ff);
 1577                         dt->dt_ff[fd] = NULL;
 1578                 }
 1579         }
 1580 
 1581         /*
 1582          * Clean out the descriptor table for the next user and return
 1583          * to the cache.
 1584          */
 1585         if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
 1586                 fd_dtab_free(fdp->fd_dt);
 1587                 /* Otherwise, done above. */
 1588                 memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
 1589                     (NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
 1590                 fdp->fd_dt = &fdp->fd_dtbuiltin;
 1591         }
 1592         if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
 1593                 KASSERT(fdp->fd_himap != fdp->fd_dhimap);
 1594                 KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
 1595                 fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
 1596         }
 1597         if (__predict_false(fdp->fd_knhash != NULL)) {
 1598                 hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
 1599                 fdp->fd_knhash = NULL;
 1600                 fdp->fd_knhashmask = 0;
 1601         } else {
 1602                 KASSERT(fdp->fd_knhashmask == 0);
 1603         }
 1604         fdp->fd_dt = &fdp->fd_dtbuiltin;
 1605         fdp->fd_lastkqfile = -1;
 1606         fdp->fd_lastfile = -1;
 1607         fdp->fd_freefile = 0;
 1608         fdp->fd_exclose = false;
 1609         memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
 1610             offsetof(filedesc_t, fd_startzero));
 1611         fdp->fd_himap = fdp->fd_dhimap;
 1612         fdp->fd_lomap = fdp->fd_dlomap;
 1613         KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
 1614         KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
 1615         KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
 1616 #ifdef DEBUG
 1617         fdp->fd_refcnt = 0; /* see fd_checkmaps */
 1618 #endif
 1619         fd_checkmaps(fdp);
 1620         pool_cache_put(filedesc_cache, fdp);
 1621 }
 1622 
 1623 /*
 1624  * File Descriptor pseudo-device driver (/dev/fd/).
 1625  *
 1626  * Opening minor device N dup()s the file (if any) connected to file
 1627  * descriptor N belonging to the calling process.  Note that this driver
 1628  * consists of only the ``open()'' routine, because all subsequent
 1629  * references to this file will be directed to the other driver.
 1630  */
 1631 static int
 1632 filedescopen(dev_t dev, int mode, int type, lwp_t *l)
 1633 {
 1634 
 1635         /*
 1636          * XXX Kludge: set dupfd to contain the value of the
 1637          * file descriptor being sought for duplication. The error
 1638          * return ensures that the vnode for this device will be released
 1639          * by vn_open. Open will detect this special error and take the
 1640          * actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN
 1641          * will simply report the error.
 1642          */
 1643         l->l_dupfd = minor(dev);        /* XXX */
 1644         return EDUPFD;
 1645 }
 1646 
 1647 /*
 1648  * Give the file behind descriptor 'old' a second, freshly allocated
 1649  * descriptor, optionally closing 'old' afterwards.
 1650  *
 1651  * old is the original fd.
 1652  * moveit is true if we should move rather than duplicate.
 1653  * flags are the open flags (converted from O_* to F*).
 1654  * newp returns the new fd on success.
 1655  *
 1656  * Returns 0 on success, EBADF when fd_getfile() finds no file for
 1657  * 'old', EACCES when a duplicate asks for more access than 'old'
 1658  * has, or whatever error fd_dup() reports.
 1659  *
 1660  * Historically these requests arrived as the EDUPFD and EMOVEFD magic
 1661  * errnos; vn_open now intercepts those and hands back either a vnode
 1662  * or a filehandle plus a flag telling whether to dup or to move.
 1663  * Duplication serves /dev/stderr, /proc/self/fd and the like; moving
 1664  * serves cloner devices that allocate a descriptor of their own, which
 1665  * must then be put in the place open() expects it.
 1666  */
 1667 int
 1668 fd_dupopen(int old, bool moveit, int flags, int *newp)
 1669 {
 1670         filedesc_t *table;
 1671         fdtab_t *dt;
 1672         fdfile_t *oldslot;
 1673         file_t *fp;
 1674         int error;
 1675 
 1676         /* Look up 'old'; without a file behind it there is nothing to do. */
 1677         fp = fd_getfile(old);
 1678         if (fp == NULL) {
 1679                 return EBADF;
 1680         }
 1681 
 1682         table = curlwp->l_fd;
 1683         dt = atomic_load_consume(&table->fd_dt);
 1684         oldslot = dt->dt_ff[old];
 1685 
 1686         /*
 1687          * Two kinds of request end up here.
 1688          *
 1689          * 1. moveit == false (formerly the EDUPFD magic errno):
 1690          *    'old' gains a duplicate and stays open itself.
 1691          *
 1692          * 2. moveit == true (formerly the EMOVEFD magic errno):
 1693          *    the file migrates to the new descriptor and 'old'
 1694          *    is closed as part of the operation.
 1695          */
 1696         if (!moveit &&
 1697             ((flags & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 1698                 /*
 1699                  * A duplicate may only be opened for a subset of the
 1700                  * access mode the existing descriptor already has.
 1701                  */
 1702                 error = EACCES;
 1703         } else {
 1704                 /* Either way copy first, passing old's ff_exclose along. */
 1705                 error = fd_dup(fp, 0, newp, oldslot->ff_exclose);
 1706                 if (error == 0 && moveit) {
 1707                         /*
 1708                          * Finish the move by closing 'old'.  There is no
 1709                          * fd_putfile() on this path; presumably fd_close()
 1710                          * disposes of the reference from fd_getfile().
 1711                          */
 1712                         (void)fd_close(old);
 1713                         return 0;
 1714                 }
 1715         }
 1716 
 1717         /* Duplicate made, or some failure: let go of 'old' again. */
 1718         fd_putfile(old);
 1719         return error;
 1720 }
 1721 
 1722 /*
 1723  * Close open files on exec.
 1724  */
 1725 void
 1726 fd_closeexec(void)
 1727 {
 1728         proc_t *p;
 1729         filedesc_t *fdp;
 1730         fdfile_t *ff;
 1731         lwp_t *l;
 1732         fdtab_t *dt;
 1733         int fd;
 1734 
 1735         l = curlwp;
 1736         p = l->l_proc;
 1737         fdp = p->p_fd;
 1738 
 1739         if (fdp->fd_refcnt > 1) {
 1740                 fdp = fd_copy();
 1741                 fd_free();
 1742                 p->p_fd = fdp;
 1743                 l->l_fd = fdp;
 1744         }
 1745         if (!fdp->fd_exclose) {
 1746                 return;
 1747         }
 1748         fdp->fd_exclose = false;
 1749         dt = atomic_load_consume(&fdp->fd_dt);
 1750 
 1751         for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 1752                 if ((ff = dt->dt_ff[fd]) == NULL) {
 1753                         KASSERT(fd >= NDFDFILE);
 1754                         continue;
 1755                 }
 1756                 KASSERT(fd >= NDFDFILE ||
 1757                     ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
 1758                 if (ff->ff_file == NULL)
 1759                         continue;
 1760                 if (ff->ff_exclose) {
 1761                         /*
 1762                          * We need a reference to close the file.
 1763                          * No other threads can see the fdfile_t at
 1764                          * this point, so don't bother locking.
 1765                          */
 1766                         KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
 1767                         ff->ff_refcnt++;
 1768                         fd_close(fd);
 1769                 }
 1770         }
 1771 }
 1772 
 1773 /*
 1774  * Sets descriptor owner. If the owner is a process, 'pgid'
 1775  * is set to positive value, process ID. If the owner is process group,
 1776  * 'pgid' is set to -pg_id.
 1777  */
 1778 int
 1779 fsetown(pid_t *pgid, u_long cmd, const void *data)
 1780 {
 1781         pid_t id = *(const pid_t *)data;
 1782         int error;
 1783 
 1784         if (id == INT_MIN)
 1785                 return EINVAL;
 1786 
 1787         switch (cmd) {
 1788         case TIOCSPGRP:
 1789                 if (id < 0)
 1790                         return EINVAL;
 1791                 id = -id;
 1792                 break;
 1793         default:
 1794                 break;
 1795         }
 1796         if (id > 0) {
 1797                 mutex_enter(&proc_lock);
 1798                 error = proc_find(id) ? 0 : ESRCH;
 1799                 mutex_exit(&proc_lock);
 1800         } else if (id < 0) {
 1801                 error = pgid_in_session(curproc, -id);
 1802         } else {
 1803                 error = 0;
 1804         }
 1805         if (!error) {
 1806                 *pgid = id;
 1807         }
 1808         return error;
 1809 }
 1810 
 1811 void
 1812 fd_set_exclose(struct lwp *l, int fd, bool exclose)
 1813 {
 1814         filedesc_t *fdp = l->l_fd;
 1815         fdfile_t *ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
 1816 
 1817         ff->ff_exclose = exclose;
 1818         if (exclose)
 1819                 fdp->fd_exclose = true;
 1820 }
 1821 
 /*
  * fgetown: report the owner of a descriptor.
  *
  *      pgid    stored owner; positive means a process ID, negative
  *              means a process group ID with the sign flipped
  *      cmd     ioctl command; for TIOCGPGRP the sign is removed
  *              (the value is negated) before it is returned
  *      data    points to the int that receives the result
  *
  * Always returns 0.
  */
 int
 fgetown(pid_t pgid, u_long cmd, void *data)
 {
         int *const out = data;

         if (cmd == TIOCGPGRP) {
                 *out = -pgid;
         } else {
                 *out = pgid;
         }
         return 0;
 }
 1841 
 1842 /*
 1843  * Send signal to descriptor owner, either process or process group.
 1844  */
 1845 void
 1846 fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
 1847 {
 1848         ksiginfo_t ksi;
 1849 
 1850         KASSERT(!cpu_intr_p());
 1851 
 1852         if (pgid == 0) {
 1853                 return;
 1854         }
 1855 
 1856         KSI_INIT(&ksi);
 1857         ksi.ksi_signo = signo;
 1858         ksi.ksi_code = code;
 1859         ksi.ksi_band = band;
 1860 
 1861         mutex_enter(&proc_lock);
 1862         if (pgid > 0) {
 1863                 struct proc *p1;
 1864 
 1865                 p1 = proc_find(pgid);
 1866                 if (p1 != NULL) {
 1867                         kpsignal(p1, &ksi, fdescdata);
 1868                 }
 1869         } else {
 1870                 struct pgrp *pgrp;
 1871 
 1872                 KASSERT(pgid < 0);
 1873                 pgrp = pgrp_find(-pgid);
 1874                 if (pgrp != NULL) {
 1875                         kpgsignal(pgrp, &ksi, fdescdata, 0);
 1876                 }
 1877         }
 1878         mutex_exit(&proc_lock);
 1879 }
 1880 
 1881 int
 1882 fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
 1883          void *data)
 1884 {
 1885         fdfile_t *ff;
 1886         filedesc_t *fdp;
 1887 
 1888         fp->f_flag = flag & FMASK;
 1889         fdp = curproc->p_fd;
 1890         ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
 1891         KASSERT(ff != NULL);
 1892         ff->ff_exclose = (flag & O_CLOEXEC) != 0;
 1893         fp->f_type = DTYPE_MISC;
 1894         fp->f_ops = fops;
 1895         fp->f_data = data;
 1896         curlwp->l_dupfd = fd;
 1897         fd_affix(curproc, fp, fd);
 1898 
 1899         return EMOVEFD;
 1900 }
 1901 
 1902 int
 1903 fnullop_fcntl(file_t *fp, u_int cmd, void *data)
 1904 {
 1905 
 1906         if (cmd == F_SETFL)
 1907                 return 0;
 1908 
 1909         return EOPNOTSUPP;
 1910 }
 1911 
 1912 int
 1913 fnullop_poll(file_t *fp, int which)
 1914 {
 1915 
 1916         return 0;
 1917 }
 1918 
 1919 int
 1920 fnullop_kqfilter(file_t *fp, struct knote *kn)
 1921 {
 1922 
 1923         return EOPNOTSUPP;
 1924 }
 1925 
 1926 void
 1927 fnullop_restart(file_t *fp)
 1928 {
 1929 
 1930 }
 1931 
 1932 int
 1933 fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
 1934             kauth_cred_t cred, int flags)
 1935 {
 1936 
 1937         return EOPNOTSUPP;
 1938 }
 1939 
 1940 int
 1941 fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
 1942              kauth_cred_t cred, int flags)
 1943 {
 1944 
 1945         return EOPNOTSUPP;
 1946 }
 1947 
 1948 int
 1949 fbadop_ioctl(file_t *fp, u_long com, void *data)
 1950 {
 1951 
 1952         return EOPNOTSUPP;
 1953 }
 1954 
 1955 int
 1956 fbadop_stat(file_t *fp, struct stat *sb)
 1957 {
 1958 
 1959         return EOPNOTSUPP;
 1960 }
 1961 
 1962 int
 1963 fbadop_close(file_t *fp)
 1964 {
 1965 
 1966         return EOPNOTSUPP;
 1967 }
 1968 
 1969 /*
 1970  * sysctl routines pertaining to file descriptors
 1971  */
 1972 
 1973 /* Initialized in sysctl_init() for now... */
 1974 extern kmutex_t sysctl_file_marker_lock;
 1975 static u_int sysctl_file_marker = 1;
 1976 
 1977 /*
 1978  * Expects to be called with proc_lock and sysctl_file_marker_lock locked.
 1979  */
 1980 static void
 1981 sysctl_file_marker_reset(void)
 1982 {
 1983         struct proc *p;
 1984 
 1985         PROCLIST_FOREACH(p, &allproc) {
 1986                 struct filedesc *fd = p->p_fd;
 1987                 fdtab_t *dt;
 1988                 u_int i;
 1989 
 1990                 mutex_enter(&fd->fd_lock);
 1991                 dt = fd->fd_dt;
 1992                 for (i = 0; i < dt->dt_nfiles; i++) {
 1993                         struct file *fp;
 1994                         fdfile_t *ff;
 1995 
 1996                         if ((ff = dt->dt_ff[i]) == NULL) {
 1997                                 continue;
 1998                         }
 1999                         if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
 2000                                 continue;
 2001                         }
 2002                         fp->f_marker = 0;
 2003                 }
 2004                 mutex_exit(&fd->fd_lock);
 2005         }
 2006 }
 2007 
 2008 /*
 2009  * sysctl helper routine for kern.file pseudo-subtree.
 2010  */
 2011 static int
 2012 sysctl_kern_file(SYSCTLFN_ARGS)
 2013 {
 2014         const bool allowaddr = get_expose_address(curproc);
 2015         struct filelist flist;
 2016         int error;
 2017         size_t buflen;
 2018         struct file *fp, fbuf;
 2019         char *start, *where;
 2020         struct proc *p;
 2021 
 2022         start = where = oldp;
 2023         buflen = *oldlenp;
 2024         
 2025         if (where == NULL) {
 2026                 /*
 2027                  * overestimate by 10 files
 2028                  */
 2029                 *oldlenp = sizeof(filehead) + (nfiles + 10) *
 2030                     sizeof(struct file);
 2031                 return 0;
 2032         }
 2033 
 2034         /*
 2035          * first sysctl_copyout filehead
 2036          */
 2037         if (buflen < sizeof(filehead)) {
 2038                 *oldlenp = 0;
 2039                 return 0;
 2040         }
 2041         sysctl_unlock();
 2042         if (allowaddr) {
 2043                 memcpy(&flist, &filehead, sizeof(flist));
 2044         } else {
 2045                 memset(&flist, 0, sizeof(flist));
 2046         }
 2047         error = sysctl_copyout(l, &flist, where, sizeof(flist));
 2048         if (error) {
 2049                 sysctl_relock();
 2050                 return error;
 2051         }
 2052         buflen -= sizeof(flist);
 2053         where += sizeof(flist);
 2054 
 2055         /*
 2056          * followed by an array of file structures
 2057          */
 2058         mutex_enter(&sysctl_file_marker_lock);
 2059         mutex_enter(&proc_lock);
 2060         PROCLIST_FOREACH(p, &allproc) {
 2061                 struct filedesc *fd;
 2062                 fdtab_t *dt;
 2063                 u_int i;
 2064 
 2065                 if (p->p_stat == SIDL) {
 2066                         /* skip embryonic processes */
 2067                         continue;
 2068                 }
 2069                 mutex_enter(p->p_lock);
 2070                 error = kauth_authorize_process(l->l_cred,
 2071                     KAUTH_PROCESS_CANSEE, p,
 2072                     KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
 2073                     NULL, NULL);
 2074                 mutex_exit(p->p_lock);
 2075                 if (error != 0) {
 2076                         /*
 2077                          * Don't leak kauth retval if we're silently
 2078                          * skipping this entry.
 2079                          */
 2080                         error = 0;
 2081                         continue;
 2082                 }
 2083 
 2084                 /*
 2085                  * Grab a hold on the process.
 2086                  */
 2087                 if (!rw_tryenter(&p->p_reflock, RW_READER)) {
 2088                         continue;
 2089                 }
 2090                 mutex_exit(&proc_lock);
 2091 
 2092                 fd = p->p_fd;
 2093                 mutex_enter(&fd->fd_lock);
 2094                 dt = fd->fd_dt;
 2095                 for (i = 0; i < dt->dt_nfiles; i++) {
 2096                         fdfile_t *ff;
 2097 
 2098                         if ((ff = dt->dt_ff[i]) == NULL) {
 2099                                 continue;
 2100                         }
 2101                         if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
 2102                                 continue;
 2103                         }
 2104 
 2105                         mutex_enter(&fp->f_lock);
 2106 
 2107                         if ((fp->f_count == 0) ||
 2108                             (fp->f_marker == sysctl_file_marker)) {
 2109                                 mutex_exit(&fp->f_lock);
 2110                                 continue;
 2111                         }
 2112 
 2113                         /* Check that we have enough space. */
 2114                         if (buflen < sizeof(struct file)) {
 2115                                 *oldlenp = where - start;
 2116                                 mutex_exit(&fp->f_lock);
 2117                                 error = ENOMEM;
 2118                                 break;
 2119                         }
 2120 
 2121                         fill_file(&fbuf, fp);
 2122                         mutex_exit(&fp->f_lock);
 2123                         error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf));
 2124                         if (error) {
 2125                                 break;
 2126                         }
 2127                         buflen -= sizeof(struct file);
 2128                         where += sizeof(struct file);
 2129 
 2130                         fp->f_marker = sysctl_file_marker;
 2131                 }
 2132                 mutex_exit(&fd->fd_lock);
 2133 
 2134                 /*
 2135                  * Release reference to process.
 2136                  */
 2137                 mutex_enter(&proc_lock);
 2138                 rw_exit(&p->p_reflock);
 2139 
 2140                 if (error)
 2141                         break;
 2142         }
 2143 
 2144         sysctl_file_marker++;
 2145         /* Reset all markers if wrapped. */
 2146         if (sysctl_file_marker == 0) {
 2147                 sysctl_file_marker_reset();
 2148                 sysctl_file_marker++;
 2149         }
 2150 
 2151         mutex_exit(&proc_lock);
 2152         mutex_exit(&sysctl_file_marker_lock);
 2153 
 2154         *oldlenp = where - start;
 2155         sysctl_relock();
 2156         return error;
 2157 }
 2158 
 2159 /*
 2160  * sysctl helper function for kern.file2
 2161  */
 2162 static int
 2163 sysctl_kern_file2(SYSCTLFN_ARGS)
 2164 {
 2165         struct proc *p;
 2166         struct file *fp;
 2167         struct filedesc *fd;
 2168         struct kinfo_file kf;
 2169         char *dp;
 2170         u_int i, op;
 2171         size_t len, needed, elem_size, out_size;
 2172         int error, arg, elem_count;
 2173         fdfile_t *ff;
 2174         fdtab_t *dt;
 2175 
 2176         if (namelen == 1 && name[0] == CTL_QUERY)
 2177                 return sysctl_query(SYSCTLFN_CALL(rnode));
 2178 
 2179         if (namelen != 4)
 2180                 return EINVAL;
 2181 
 2182         error = 0;
 2183         dp = oldp;
 2184         len = (oldp != NULL) ? *oldlenp : 0;
 2185         op = name[0];
 2186         arg = name[1];
 2187         elem_size = name[2];
 2188         elem_count = name[3];
 2189         out_size = MIN(sizeof(kf), elem_size);
 2190         needed = 0;
 2191 
 2192         if (elem_size < 1 || elem_count < 0)
 2193                 return EINVAL;
 2194 
 2195         switch (op) {
 2196         case KERN_FILE_BYFILE:
 2197         case KERN_FILE_BYPID:
 2198                 /*
 2199                  * We're traversing the process list in both cases; the BYFILE
 2200                  * case does additional work of keeping track of files already
 2201                  * looked at.
 2202                  */
 2203 
 2204                 /* doesn't use arg so it must be zero */
 2205                 if ((op == KERN_FILE_BYFILE) && (arg != 0))
 2206                         return EINVAL;
 2207 
 2208                 if ((op == KERN_FILE_BYPID) && (arg < -1))
 2209                         /* -1 means all processes */
 2210                         return EINVAL;
 2211 
 2212                 sysctl_unlock();
 2213                 if (op == KERN_FILE_BYFILE)
 2214                         mutex_enter(&sysctl_file_marker_lock);
 2215                 mutex_enter(&proc_lock);
 2216                 PROCLIST_FOREACH(p, &allproc) {
 2217                         if (p->p_stat == SIDL) {
 2218                                 /* skip embryonic processes */
 2219                                 continue;
 2220                         }
 2221                         if (arg > 0 && p->p_pid != arg) {
 2222                                 /* pick only the one we want */
 2223                                 /* XXX want 0 to mean "kernel files" */
 2224                                 continue;
 2225                         }
 2226                         mutex_enter(p->p_lock);
 2227                         error = kauth_authorize_process(l->l_cred,
 2228                             KAUTH_PROCESS_CANSEE, p,
 2229                             KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
 2230                             NULL, NULL);
 2231                         mutex_exit(p->p_lock);
 2232                         if (error != 0) {
 2233                                 /*
 2234                                  * Don't leak kauth retval if we're silently
 2235                                  * skipping this entry.
 2236                                  */
 2237                                 error = 0;
 2238                                 continue;
 2239                         }
 2240 
 2241                         /*
 2242                          * Grab a hold on the process.
 2243                          */
 2244                         if (!rw_tryenter(&p->p_reflock, RW_READER)) {
 2245                                 continue;
 2246                         }
 2247                         mutex_exit(&proc_lock);
 2248 
 2249                         fd = p->p_fd;
 2250                         mutex_enter(&fd->fd_lock);
 2251                         dt = fd->fd_dt;
 2252                         for (i = 0; i < dt->dt_nfiles; i++) {
 2253                                 if ((ff = dt->dt_ff[i]) == NULL) {
 2254                                         continue;
 2255                                 }
 2256                                 if ((fp = atomic_load_consume(&ff->ff_file)) ==
 2257                                     NULL) {
 2258                                         continue;
 2259                                 }
 2260 
 2261                                 if ((op == KERN_FILE_BYFILE) &&
 2262                                     (fp->f_marker == sysctl_file_marker)) {
 2263                                         continue;
 2264                                 }
 2265                                 if (len >= elem_size && elem_count > 0) {
 2266                                         mutex_enter(&fp->f_lock);
 2267                                         fill_file2(&kf, fp, ff, i, p->p_pid);
 2268                                         mutex_exit(&fp->f_lock);
 2269                                         mutex_exit(&fd->fd_lock);
 2270                                         error = sysctl_copyout(l,
 2271                                             &kf, dp, out_size);
 2272                                         mutex_enter(&fd->fd_lock);
 2273                                         if (error)
 2274                                                 break;
 2275                                         dp += elem_size;
 2276                                         len -= elem_size;
 2277                                 }
 2278                                 if (op == KERN_FILE_BYFILE)
 2279                                         fp->f_marker = sysctl_file_marker;
 2280                                 needed += elem_size;
 2281                                 if (elem_count > 0 && elem_count != INT_MAX)
 2282                                         elem_count--;
 2283                         }
 2284                         mutex_exit(&fd->fd_lock);
 2285 
 2286                         /*
 2287                          * Release reference to process.
 2288                          */
 2289                         mutex_enter(&proc_lock);
 2290                         rw_exit(&p->p_reflock);
 2291                 }
 2292                 if (op == KERN_FILE_BYFILE) {
 2293                         sysctl_file_marker++;
 2294 
 2295                         /* Reset all markers if wrapped. */
 2296                         if (sysctl_file_marker == 0) {
 2297                                 sysctl_file_marker_reset();
 2298                                 sysctl_file_marker++;
 2299                         }
 2300                 }
 2301                 mutex_exit(&proc_lock);
 2302                 if (op == KERN_FILE_BYFILE)
 2303                         mutex_exit(&sysctl_file_marker_lock);
 2304                 sysctl_relock();
 2305                 break;
 2306         default:
 2307                 return EINVAL;
 2308         }
 2309 
 2310         if (oldp == NULL)
 2311                 needed += KERN_FILESLOP * elem_size;
 2312         *oldlenp = needed;
 2313 
 2314         return error;
 2315 }
 2316 
 2317 static void
 2318 fill_file(struct file *fp, const struct file *fpsrc)
 2319 {
 2320         const bool allowaddr = get_expose_address(curproc);
 2321 
 2322         memset(fp, 0, sizeof(*fp));
 2323 
 2324         fp->f_offset = fpsrc->f_offset;
 2325         COND_SET_PTR(fp->f_cred, fpsrc->f_cred, allowaddr);
 2326         COND_SET_CPTR(fp->f_ops, fpsrc->f_ops, allowaddr);
 2327         COND_SET_STRUCT(fp->f_undata, fpsrc->f_undata, allowaddr);
 2328         COND_SET_STRUCT(fp->f_list, fpsrc->f_list, allowaddr);
 2329         fp->f_flag = fpsrc->f_flag;
 2330         fp->f_marker = fpsrc->f_marker;
 2331         fp->f_type = fpsrc->f_type;
 2332         fp->f_advice = fpsrc->f_advice;
 2333         fp->f_count = fpsrc->f_count;
 2334         fp->f_msgcount = fpsrc->f_msgcount;
 2335         fp->f_unpcount = fpsrc->f_unpcount;
 2336         COND_SET_STRUCT(fp->f_unplist, fpsrc->f_unplist, allowaddr);
 2337 }
 2338 
 2339 static void
 2340 fill_file2(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff,
 2341           int i, pid_t pid)
 2342 {
 2343         const bool allowaddr = get_expose_address(curproc);
 2344 
 2345         memset(kp, 0, sizeof(*kp));
 2346 
 2347         COND_SET_VALUE(kp->ki_fileaddr, PTRTOUINT64(fp), allowaddr);
 2348         kp->ki_flag =           fp->f_flag;
 2349         kp->ki_iflags =         0;
 2350         kp->ki_ftype =          fp->f_type;
 2351         kp->ki_count =          fp->f_count;
 2352         kp->ki_msgcount =       fp->f_msgcount;
 2353         COND_SET_VALUE(kp->ki_fucred, PTRTOUINT64(fp->f_cred), allowaddr);
 2354         kp->ki_fuid =           kauth_cred_geteuid(fp->f_cred);
 2355         kp->ki_fgid =           kauth_cred_getegid(fp->f_cred);
 2356         COND_SET_VALUE(kp->ki_fops, PTRTOUINT64(fp->f_ops), allowaddr);
 2357         kp->ki_foffset =        fp->f_offset;
 2358         COND_SET_VALUE(kp->ki_fdata, PTRTOUINT64(fp->f_data), allowaddr);
 2359 
 2360         /* vnode information to glue this file to something */
 2361         if (fp->f_type == DTYPE_VNODE) {
 2362                 struct vnode *vp = fp->f_vnode;
 2363 
 2364                 COND_SET_VALUE(kp->ki_vun, PTRTOUINT64(vp->v_un.vu_socket),
 2365                     allowaddr);
 2366                 kp->ki_vsize =  vp->v_size;
 2367                 kp->ki_vtype =  vp->v_type;
 2368                 kp->ki_vtag =   vp->v_tag;
 2369                 COND_SET_VALUE(kp->ki_vdata, PTRTOUINT64(vp->v_data),
 2370                     allowaddr);
 2371         }
 2372 
 2373         /* process information when retrieved via KERN_FILE_BYPID */
 2374         if (ff != NULL) {
 2375                 kp->ki_pid =            pid;
 2376                 kp->ki_fd =             i;
 2377                 kp->ki_ofileflags =     ff->ff_exclose;
 2378                 kp->ki_usecount =       ff->ff_refcnt;
 2379         }
 2380 }

Cache object: f7ba16fa69d63facd64d9ecb21892e9c


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.