FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_descrip.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1982, 1986, 1989, 1991, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)kern_descrip.c      8.6 (Berkeley) 4/19/94
   37  */
   38 
   39 #include <sys/cdefs.h>
   40 __FBSDID("$FreeBSD$");
   41 
   42 #include "opt_capsicum.h"
   43 #include "opt_ddb.h"
   44 #include "opt_ktrace.h"
   45 
   46 #include <sys/param.h>
   47 #include <sys/systm.h>
   48 
   49 #include <sys/capsicum.h>
   50 #include <sys/conf.h>
   51 #include <sys/fcntl.h>
   52 #include <sys/file.h>
   53 #include <sys/filedesc.h>
   54 #include <sys/filio.h>
   55 #include <sys/jail.h>
   56 #include <sys/kernel.h>
   57 #include <sys/limits.h>
   58 #include <sys/lock.h>
   59 #include <sys/malloc.h>
   60 #include <sys/mount.h>
   61 #include <sys/mutex.h>
   62 #include <sys/namei.h>
   63 #include <sys/selinfo.h>
   64 #include <sys/poll.h>
   65 #include <sys/priv.h>
   66 #include <sys/proc.h>
   67 #include <sys/protosw.h>
   68 #include <sys/racct.h>
   69 #include <sys/resourcevar.h>
   70 #include <sys/sbuf.h>
   71 #include <sys/signalvar.h>
   72 #include <sys/kdb.h>
   73 #include <sys/smr.h>
   74 #include <sys/stat.h>
   75 #include <sys/sx.h>
   76 #include <sys/syscallsubr.h>
   77 #include <sys/sysctl.h>
   78 #include <sys/sysproto.h>
   79 #include <sys/unistd.h>
   80 #include <sys/user.h>
   81 #include <sys/vnode.h>
   82 #include <sys/ktrace.h>
   83 
   84 #include <net/vnet.h>
   85 
   86 #include <security/audit/audit.h>
   87 
   88 #include <vm/uma.h>
   89 #include <vm/vm.h>
   90 
   91 #include <ddb/ddb.h>
   92 
   93 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
   94 static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
   95 static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
   96 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
   97     "file desc to leader structures");
   98 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
   99 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
  100 
  101 MALLOC_DECLARE(M_FADVISE);
  102 
  103 static __read_mostly uma_zone_t file_zone;
  104 static __read_mostly uma_zone_t filedesc0_zone;
  105 __read_mostly uma_zone_t pwd_zone;
  106 VFS_SMR_DECLARE;
  107 
  108 static int      closefp(struct filedesc *fdp, int fd, struct file *fp,
  109                     struct thread *td, bool holdleaders, bool audit);
  110 static void     export_file_to_kinfo(struct file *fp, int fd,
  111                     cap_rights_t *rightsp, struct kinfo_file *kif,
  112                     struct filedesc *fdp, int flags);
  113 static int      fd_first_free(struct filedesc *fdp, int low, int size);
  114 static void     fdgrowtable(struct filedesc *fdp, int nfd);
  115 static void     fdgrowtable_exp(struct filedesc *fdp, int nfd);
  116 static void     fdunused(struct filedesc *fdp, int fd);
  117 static void     fdused(struct filedesc *fdp, int fd);
  118 static int      fget_unlocked_seq(struct filedesc *fdp, int fd,
  119                     cap_rights_t *needrightsp, struct file **fpp, seqc_t *seqp);
  120 static int      getmaxfd(struct thread *td);
  121 static u_long   *filecaps_copy_prep(const struct filecaps *src);
  122 static void     filecaps_copy_finish(const struct filecaps *src,
  123                     struct filecaps *dst, u_long *ioctls);
  124 static u_long   *filecaps_free_prep(struct filecaps *fcaps);
  125 static void     filecaps_free_finish(u_long *ioctls);
  126 
  127 static struct pwd *pwd_alloc(void);
  128 
  129 /*
  130  * Each process has:
  131  *
  132  * - An array of open file descriptors (fd_ofiles)
  133  * - An array of file flags (fd_ofileflags)
  134  * - A bitmap recording which descriptors are in use (fd_map)
  135  *
  136  * A process starts out with NDFILE descriptors.  The value of NDFILE has
  137  * been selected based on the historical limit of 20 open files, and an
  138  * assumption that the majority of processes, especially short-lived
  139  * processes like shells, will never need more.
  140  *
  141  * If this initial allocation is exhausted, a larger descriptor table and
  142  * map are allocated dynamically, and the pointers in the process's struct
  143  * filedesc are updated to point to those.  This is repeated every time
  144  * the process runs out of file descriptors (provided it hasn't hit its
  145  * resource limit).
  146  *
  147  * Since threads may hold references to individual descriptor table
  148  * entries, the tables are never freed.  Instead, they are placed on a
  149  * linked list and freed only when the struct filedesc is released.
  150  */
  151 #define NDFILE          20
  152 #define NDSLOTSIZE      sizeof(NDSLOTTYPE)
  153 #define NDENTRIES       (NDSLOTSIZE * __CHAR_BIT)
  154 #define NDSLOT(x)       ((x) / NDENTRIES)
  155 #define NDBIT(x)        ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
  156 #define NDSLOTS(x)      (((x) + NDENTRIES - 1) / NDENTRIES)
  157 
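      /*
       * Worked example of the bitmap macros above, assuming LP64 where
       * NDSLOTTYPE is a 64-bit u_long and NDENTRIES is therefore 64:
       * descriptor 70 lands in fd_map[1], bit 6, and 65 descriptors need
       * two bitmap words.  fdisused() below simply tests
       * (fd_map[NDSLOT(fd)] & NDBIT(fd)).
       */
      #ifdef __LP64__
      _Static_assert(NDSLOT(70) == 1, "fd 70 maps to fd_map[1]");
      _Static_assert(NDBIT(70) == ((NDSLOTTYPE)1 << 6), "fd 70 is bit 6 of its word");
      _Static_assert(NDSLOTS(65) == 2, "65 descriptors need two bitmap words");
      #endif
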
  158 /*
  159  * SLIST entry used to keep track of ofiles which must be reclaimed when
  160  * the process exits.
  161  */
  162 struct freetable {
  163         struct fdescenttbl *ft_table;
  164         SLIST_ENTRY(freetable) ft_next;
  165 };
  166 
  167 /*
  168  * Initial allocation: a filedesc structure + the head of SLIST used to
  169  * keep track of old ofiles + enough space for NDFILE descriptors.
  170  */
  171 
  172 struct fdescenttbl0 {
  173         int     fdt_nfiles;
  174         struct  filedescent fdt_ofiles[NDFILE];
  175 };
  176 
  177 struct filedesc0 {
  178         struct filedesc fd_fd;
  179         SLIST_HEAD(, freetable) fd_free;
  180         struct  fdescenttbl0 fd_dfiles;
  181         NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
  182 };
  183 
  184 /*
  185  * Descriptor management.
  186  */
  187 static int __exclusive_cache_line openfiles; /* actual number of open files */
  188 struct mtx sigio_lock;          /* mtx to protect pointers to sigio */
  189 void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
  190 
  191 /*
  192  * If low >= size, just return low. Otherwise find the first zero bit in the
  193  * given bitmap, starting at low and not exceeding size - 1. Return size if
  194  * not found.
  195  */
  196 static int
  197 fd_first_free(struct filedesc *fdp, int low, int size)
  198 {
  199         NDSLOTTYPE *map = fdp->fd_map;
  200         NDSLOTTYPE mask;
  201         int off, maxoff;
  202 
  203         if (low >= size)
  204                 return (low);
  205 
  206         off = NDSLOT(low);
  207         if (low % NDENTRIES) {
  208                 mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
  209                 if ((mask &= ~map[off]) != 0UL)
  210                         return (off * NDENTRIES + ffsl(mask) - 1);
  211                 ++off;
  212         }
  213         for (maxoff = NDSLOTS(size); off < maxoff; ++off)
  214                 if (map[off] != ~0UL)
  215                         return (off * NDENTRIES + ffsl(~map[off]) - 1);
  216         return (size);
  217 }
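      /*
       * Worked example, again assuming a 64-bit NDSLOTTYPE: with
       * descriptors 0-4 in use, fd_map[0] == 0x1f.  A call
       * fd_first_free(fdp, 3, 128) builds mask = ~(~0UL >> 61), i.e.
       * bits 3-63, clears the in-use bits with ~fd_map[0] (bits 5-63
       * remain), and ffsl() returns 6, so the result is 5 -- the lowest
       * free descriptor that is not below 3.
       */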
  218 
  219 /*
  220  * Find the last used fd.
  221  *
  222  * Call this variant if fdp can't be modified by anyone else (e.g., during exec).
  223  * Otherwise use fdlastfile.
  224  */
  225 int
  226 fdlastfile_single(struct filedesc *fdp)
  227 {
  228         NDSLOTTYPE *map = fdp->fd_map;
  229         int off, minoff;
  230 
  231         off = NDSLOT(fdp->fd_nfiles - 1);
  232         for (minoff = NDSLOT(0); off >= minoff; --off)
  233                 if (map[off] != 0)
  234                         return (off * NDENTRIES + flsl(map[off]) - 1);
  235         return (-1);
  236 }
  237 
  238 int
  239 fdlastfile(struct filedesc *fdp)
  240 {
  241 
  242         FILEDESC_LOCK_ASSERT(fdp);
  243         return (fdlastfile_single(fdp));
  244 }
  245 
  246 static int
  247 fdisused(struct filedesc *fdp, int fd)
  248 {
  249 
  250         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
  251             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
  252 
  253         return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
  254 }
  255 
  256 /*
  257  * Mark a file descriptor as used.
  258  */
  259 static void
  260 fdused_init(struct filedesc *fdp, int fd)
  261 {
  262 
  263         KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
  264 
  265         fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
  266 }
  267 
  268 static void
  269 fdused(struct filedesc *fdp, int fd)
  270 {
  271 
  272         FILEDESC_XLOCK_ASSERT(fdp);
  273 
  274         fdused_init(fdp, fd);
  275         if (fd == fdp->fd_freefile)
  276                 fdp->fd_freefile++;
  277 }
  278 
  279 /*
  280  * Mark a file descriptor as unused.
  281  */
  282 static void
  283 fdunused(struct filedesc *fdp, int fd)
  284 {
  285 
  286         FILEDESC_XLOCK_ASSERT(fdp);
  287 
  288         KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
  289         KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
  290             ("fd=%d is still in use", fd));
  291 
  292         fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
  293         if (fd < fdp->fd_freefile)
  294                 fdp->fd_freefile = fd;
  295 }
  296 
  297 /*
  298  * Free a file descriptor.
  299  *
  300  * Avoid some work if fdp is about to be destroyed.
  301  */
  302 static inline void
  303 fdefree_last(struct filedescent *fde)
  304 {
  305 
  306         filecaps_free(&fde->fde_caps);
  307 }
  308 
  309 static inline void
  310 fdfree(struct filedesc *fdp, int fd)
  311 {
  312         struct filedescent *fde;
  313 
  314         FILEDESC_XLOCK_ASSERT(fdp);
  315         fde = &fdp->fd_ofiles[fd];
  316 #ifdef CAPABILITIES
  317         seqc_write_begin(&fde->fde_seqc);
  318 #endif
  319         fde->fde_file = NULL;
  320 #ifdef CAPABILITIES
  321         seqc_write_end(&fde->fde_seqc);
  322 #endif
  323         fdefree_last(fde);
  324         fdunused(fdp, fd);
  325 }
  326 
  327 /*
  328  * System calls on descriptors.
  329  */
  330 #ifndef _SYS_SYSPROTO_H_
  331 struct getdtablesize_args {
  332         int     dummy;
  333 };
  334 #endif
  335 /* ARGSUSED */
  336 int
  337 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
  338 {
  339 #ifdef  RACCT
  340         uint64_t lim;
  341 #endif
  342 
  343         td->td_retval[0] = getmaxfd(td);
  344 #ifdef  RACCT
  345         PROC_LOCK(td->td_proc);
  346         lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
  347         PROC_UNLOCK(td->td_proc);
  348         if (lim < td->td_retval[0])
  349                 td->td_retval[0] = lim;
  350 #endif
  351         return (0);
  352 }
  353 
  354 /*
  355  * Duplicate a file descriptor to a particular value.
  356  *
  357  * Note: keep in mind that a potential race condition exists when closing
  358  * descriptors from a shared descriptor table (via rfork).
  359  */
  360 #ifndef _SYS_SYSPROTO_H_
  361 struct dup2_args {
  362         u_int   from;
  363         u_int   to;
  364 };
  365 #endif
  366 /* ARGSUSED */
  367 int
  368 sys_dup2(struct thread *td, struct dup2_args *uap)
  369 {
  370 
  371         return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
  372 }
  373 
  374 /*
  375  * Duplicate a file descriptor.
  376  */
  377 #ifndef _SYS_SYSPROTO_H_
  378 struct dup_args {
  379         u_int   fd;
  380 };
  381 #endif
  382 /* ARGSUSED */
  383 int
  384 sys_dup(struct thread *td, struct dup_args *uap)
  385 {
  386 
  387         return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
  388 }
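      /*
       * Sketch of how the dup family maps onto kern_dup() below (see
       * also the F_DUPFD/F_DUP2FD cases in kern_fcntl()):
       *
       *   dup(fd)                 -> kern_dup(td, FDDUP_NORMAL, 0, fd, 0)
       *   dup2(from, to)          -> kern_dup(td, FDDUP_FIXED, 0, from, to)
       *   fcntl(fd, F_DUPFD, min) -> kern_dup(td, FDDUP_FCNTL, 0, fd, min)
       *   fcntl(fd, F_DUPFD_CLOEXEC, min)
       *                           -> kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, min)
       *
       * FDDUP_NORMAL and FDDUP_FCNTL allocate the lowest free descriptor
       * greater than or equal to the last argument, and FDDUP_FCNTL
       * returns EINVAL (not EBADF) for an out-of-range target, while
       * FDDUP_FIXED replaces the target descriptor and closes whatever
       * it previously referred to.
       */
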
  389 
  390 /*
  391  * The file control system call.
  392  */
  393 #ifndef _SYS_SYSPROTO_H_
  394 struct fcntl_args {
  395         int     fd;
  396         int     cmd;
  397         long    arg;
  398 };
  399 #endif
  400 /* ARGSUSED */
  401 int
  402 sys_fcntl(struct thread *td, struct fcntl_args *uap)
  403 {
  404 
  405         return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
  406 }
  407 
  408 int
  409 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
  410 {
  411         struct flock fl;
  412         struct __oflock ofl;
  413         intptr_t arg1;
  414         int error, newcmd;
  415 
  416         error = 0;
  417         newcmd = cmd;
  418         switch (cmd) {
  419         case F_OGETLK:
  420         case F_OSETLK:
  421         case F_OSETLKW:
  422                 /*
  423                  * Convert old flock structure to new.
  424                  */
  425                 error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
  426                 fl.l_start = ofl.l_start;
  427                 fl.l_len = ofl.l_len;
  428                 fl.l_pid = ofl.l_pid;
  429                 fl.l_type = ofl.l_type;
  430                 fl.l_whence = ofl.l_whence;
  431                 fl.l_sysid = 0;
  432 
  433                 switch (cmd) {
  434                 case F_OGETLK:
  435                         newcmd = F_GETLK;
  436                         break;
  437                 case F_OSETLK:
  438                         newcmd = F_SETLK;
  439                         break;
  440                 case F_OSETLKW:
  441                         newcmd = F_SETLKW;
  442                         break;
  443                 }
  444                 arg1 = (intptr_t)&fl;
  445                 break;
  446         case F_GETLK:
  447         case F_SETLK:
  448         case F_SETLKW:
  449         case F_SETLK_REMOTE:
  450                 error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
  451                 arg1 = (intptr_t)&fl;
  452                 break;
  453         default:
  454                 arg1 = arg;
  455                 break;
  456         }
  457         if (error)
  458                 return (error);
  459         error = kern_fcntl(td, fd, newcmd, arg1);
  460         if (error)
  461                 return (error);
  462         if (cmd == F_OGETLK) {
  463                 ofl.l_start = fl.l_start;
  464                 ofl.l_len = fl.l_len;
  465                 ofl.l_pid = fl.l_pid;
  466                 ofl.l_type = fl.l_type;
  467                 ofl.l_whence = fl.l_whence;
  468                 error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
  469         } else if (cmd == F_GETLK) {
  470                 error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
  471         }
  472         return (error);
  473 }
  474 
  475 int
  476 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
  477 {
  478         struct filedesc *fdp;
  479         struct flock *flp;
  480         struct file *fp, *fp2;
  481         struct filedescent *fde;
  482         struct proc *p;
  483         struct vnode *vp;
  484         struct mount *mp;
  485         struct kinfo_file *kif;
  486         int error, flg, kif_sz, seals, tmp;
  487         uint64_t bsize;
  488         off_t foffset;
  489 
  490         error = 0;
  491         flg = F_POSIX;
  492         p = td->td_proc;
  493         fdp = p->p_fd;
  494 
  495         AUDIT_ARG_FD(cmd);
  496         AUDIT_ARG_CMD(cmd);
  497         switch (cmd) {
  498         case F_DUPFD:
  499                 tmp = arg;
  500                 error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
  501                 break;
  502 
  503         case F_DUPFD_CLOEXEC:
  504                 tmp = arg;
  505                 error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
  506                 break;
  507 
  508         case F_DUP2FD:
  509                 tmp = arg;
  510                 error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
  511                 break;
  512 
  513         case F_DUP2FD_CLOEXEC:
  514                 tmp = arg;
  515                 error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
  516                 break;
  517 
  518         case F_GETFD:
  519                 error = EBADF;
  520                 FILEDESC_SLOCK(fdp);
  521                 fde = fdeget_locked(fdp, fd);
  522                 if (fde != NULL) {
  523                         td->td_retval[0] =
  524                             (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
  525                         error = 0;
  526                 }
  527                 FILEDESC_SUNLOCK(fdp);
  528                 break;
  529 
  530         case F_SETFD:
  531                 error = EBADF;
  532                 FILEDESC_XLOCK(fdp);
  533                 fde = fdeget_locked(fdp, fd);
  534                 if (fde != NULL) {
  535                         fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
  536                             (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
  537                         error = 0;
  538                 }
  539                 FILEDESC_XUNLOCK(fdp);
  540                 break;
  541 
  542         case F_GETFL:
  543                 error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
  544                 if (error != 0)
  545                         break;
  546                 td->td_retval[0] = OFLAGS(fp->f_flag);
  547                 fdrop(fp, td);
  548                 break;
  549 
  550         case F_SETFL:
  551                 error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
  552                 if (error != 0)
  553                         break;
  554                 if (fp->f_ops == &path_fileops) {
  555                         fdrop(fp, td);
  556                         error = EBADF;
  557                         break;
  558                 }
  559                 do {
  560                         tmp = flg = fp->f_flag;
  561                         tmp &= ~FCNTLFLAGS;
  562                         tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
  563                 } while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
  564                 tmp = fp->f_flag & FNONBLOCK;
  565                 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  566                 if (error != 0) {
  567                         fdrop(fp, td);
  568                         break;
  569                 }
  570                 tmp = fp->f_flag & FASYNC;
  571                 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
  572                 if (error == 0) {
  573                         fdrop(fp, td);
  574                         break;
  575                 }
  576                 atomic_clear_int(&fp->f_flag, FNONBLOCK);
  577                 tmp = 0;
  578                 (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  579                 fdrop(fp, td);
  580                 break;
  581 
  582         case F_GETOWN:
  583                 error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
  584                 if (error != 0)
  585                         break;
  586                 error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
  587                 if (error == 0)
  588                         td->td_retval[0] = tmp;
  589                 fdrop(fp, td);
  590                 break;
  591 
  592         case F_SETOWN:
  593                 error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
  594                 if (error != 0)
  595                         break;
  596                 tmp = arg;
  597                 error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
  598                 fdrop(fp, td);
  599                 break;
  600 
  601         case F_SETLK_REMOTE:
  602                 error = priv_check(td, PRIV_NFS_LOCKD);
  603                 if (error != 0)
  604                         return (error);
  605                 flg = F_REMOTE;
  606                 goto do_setlk;
  607 
  608         case F_SETLKW:
  609                 flg |= F_WAIT;
  610                 /* FALLTHROUGH F_SETLK */
  611 
  612         case F_SETLK:
  613         do_setlk:
  614                 flp = (struct flock *)arg;
  615                 if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
  616                         error = EINVAL;
  617                         break;
  618                 }
  619 
  620                 error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
  621                 if (error != 0)
  622                         break;
  623                 if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
  624                         error = EBADF;
  625                         fdrop(fp, td);
  626                         break;
  627                 }
  628 
  629                 if (flp->l_whence == SEEK_CUR) {
  630                         foffset = foffset_get(fp);
  631                         if (foffset < 0 ||
  632                             (flp->l_start > 0 &&
  633                              foffset > OFF_MAX - flp->l_start)) {
  634                                 error = EOVERFLOW;
  635                                 fdrop(fp, td);
  636                                 break;
  637                         }
  638                         flp->l_start += foffset;
  639                 }
  640 
  641                 vp = fp->f_vnode;
  642                 switch (flp->l_type) {
  643                 case F_RDLCK:
  644                         if ((fp->f_flag & FREAD) == 0) {
  645                                 error = EBADF;
  646                                 break;
  647                         }
  648                         if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
  649                                 PROC_LOCK(p->p_leader);
  650                                 p->p_leader->p_flag |= P_ADVLOCK;
  651                                 PROC_UNLOCK(p->p_leader);
  652                         }
  653                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
  654                             flp, flg);
  655                         break;
  656                 case F_WRLCK:
  657                         if ((fp->f_flag & FWRITE) == 0) {
  658                                 error = EBADF;
  659                                 break;
  660                         }
  661                         if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
  662                                 PROC_LOCK(p->p_leader);
  663                                 p->p_leader->p_flag |= P_ADVLOCK;
  664                                 PROC_UNLOCK(p->p_leader);
  665                         }
  666                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
  667                             flp, flg);
  668                         break;
  669                 case F_UNLCK:
  670                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
  671                             flp, flg);
  672                         break;
  673                 case F_UNLCKSYS:
  674                         if (flg != F_REMOTE) {
  675                                 error = EINVAL;
  676                                 break;
  677                         }
  678                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
  679                             F_UNLCKSYS, flp, flg);
  680                         break;
  681                 default:
  682                         error = EINVAL;
  683                         break;
  684                 }
  685                 if (error != 0 || flp->l_type == F_UNLCK ||
  686                     flp->l_type == F_UNLCKSYS) {
  687                         fdrop(fp, td);
  688                         break;
  689                 }
  690 
  691                 /*
  692                  * Check for a race with close.
  693                  *
  694                  * The vnode is now advisory locked (or unlocked, but this case
  695                  * is not really important) as the caller requested.
  696                  * We had to drop the filedesc lock, so we need to recheck if
  697                  * the descriptor is still valid, because if it was closed
   698  * in the meantime we need to remove the advisory lock from the
   699  * vnode - a close on any descriptor referring to an advisory-locked
   700  * vnode removes that lock.
   701  * We will return 0 on purpose in that case, as the result of a
   702  * successful advisory lock might have been externally visible
  703                  * already. This is fine - effectively we pretend to the caller
  704                  * that the closing thread was a bit slower and that the
  705                  * advisory lock succeeded before the close.
  706                  */
  707                 error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2);
  708                 if (error != 0) {
  709                         fdrop(fp, td);
  710                         break;
  711                 }
  712                 if (fp != fp2) {
  713                         flp->l_whence = SEEK_SET;
  714                         flp->l_start = 0;
  715                         flp->l_len = 0;
  716                         flp->l_type = F_UNLCK;
  717                         (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
  718                             F_UNLCK, flp, F_POSIX);
  719                 }
  720                 fdrop(fp, td);
  721                 fdrop(fp2, td);
  722                 break;
  723 
  724         case F_GETLK:
  725                 error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
  726                 if (error != 0)
  727                         break;
  728                 if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
  729                         error = EBADF;
  730                         fdrop(fp, td);
  731                         break;
  732                 }
  733                 flp = (struct flock *)arg;
  734                 if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
  735                     flp->l_type != F_UNLCK) {
  736                         error = EINVAL;
  737                         fdrop(fp, td);
  738                         break;
  739                 }
  740                 if (flp->l_whence == SEEK_CUR) {
  741                         foffset = foffset_get(fp);
  742                         if ((flp->l_start > 0 &&
  743                             foffset > OFF_MAX - flp->l_start) ||
  744                             (flp->l_start < 0 &&
  745                             foffset < OFF_MIN - flp->l_start)) {
  746                                 error = EOVERFLOW;
  747                                 fdrop(fp, td);
  748                                 break;
  749                         }
  750                         flp->l_start += foffset;
  751                 }
  752                 vp = fp->f_vnode;
  753                 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
  754                     F_POSIX);
  755                 fdrop(fp, td);
  756                 break;
  757 
  758         case F_ADD_SEALS:
  759                 error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
  760                 if (error != 0)
  761                         break;
  762                 error = fo_add_seals(fp, arg);
  763                 fdrop(fp, td);
  764                 break;
  765 
  766         case F_GET_SEALS:
  767                 error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
  768                 if (error != 0)
  769                         break;
  770                 if (fo_get_seals(fp, &seals) == 0)
  771                         td->td_retval[0] = seals;
  772                 else
  773                         error = EINVAL;
  774                 fdrop(fp, td);
  775                 break;
  776 
  777         case F_RDAHEAD:
  778                 arg = arg ? 128 * 1024: 0;
  779                 /* FALLTHROUGH */
  780         case F_READAHEAD:
  781                 error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
  782                 if (error != 0)
  783                         break;
  784                 if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
  785                         fdrop(fp, td);
  786                         error = EBADF;
  787                         break;
  788                 }
  789                 vp = fp->f_vnode;
  790                 if (vp->v_type != VREG) {
  791                         fdrop(fp, td);
  792                         error = ENOTTY;
  793                         break;
  794                 }
  795 
  796                 /*
  797                  * Exclusive lock synchronizes against f_seqcount reads and
  798                  * writes in sequential_heuristic().
  799                  */
  800                 error = vn_lock(vp, LK_EXCLUSIVE);
  801                 if (error != 0) {
  802                         fdrop(fp, td);
  803                         break;
  804                 }
  805                 if (arg >= 0) {
  806                         bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
  807                         arg = MIN(arg, INT_MAX - bsize + 1);
  808                         fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
  809                             (arg + bsize - 1) / bsize);
  810                         atomic_set_int(&fp->f_flag, FRDAHEAD);
  811                 } else {
  812                         atomic_clear_int(&fp->f_flag, FRDAHEAD);
  813                 }
  814                 VOP_UNLOCK(vp);
  815                 fdrop(fp, td);
  816                 break;
  817 
  818         case F_ISUNIONSTACK:
  819                 /*
  820                  * Check if the vnode is part of a union stack (either the
  821                  * "union" flag from mount(2) or unionfs).
  822                  *
  823  * Prior to the introduction of this op, libc's readdir would call
  824                  * fstatfs(2), in effect unnecessarily copying kilobytes of
  825                  * data just to check fs name and a mount flag.
  826                  *
  827                  * Fixing the code to handle everything in the kernel instead
  828                  * is a non-trivial endeavor and has low priority, thus this
  829                  * horrible kludge facilitates the current behavior in a much
  830                  * cheaper manner until someone(tm) sorts this out.
  831                  */
  832                 error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
  833                 if (error != 0)
  834                         break;
  835                 if (fp->f_type != DTYPE_VNODE) {
  836                         fdrop(fp, td);
  837                         error = EBADF;
  838                         break;
  839                 }
  840                 vp = fp->f_vnode;
  841                 /*
   842  * Since we don't prevent the vnode from being doomed, even a
   843  * non-null mp found here can become stale immediately. This is
   844  * tolerable since mount points are type-stable (providing safe
   845  * memory access) and any vfs op on this vnode going forward will
   846  * return an error (so the return value in that case is meaningless).
  847                  */
  848                 mp = atomic_load_ptr(&vp->v_mount);
  849                 if (__predict_false(mp == NULL)) {
  850                         fdrop(fp, td);
  851                         error = EBADF;
  852                         break;
  853                 }
  854                 td->td_retval[0] = 0;
  855                 if (mp->mnt_kern_flag & MNTK_UNIONFS ||
  856                     mp->mnt_flag & MNT_UNION)
  857                         td->td_retval[0] = 1;
  858                 fdrop(fp, td);
  859                 break;
  860 
  861         case F_KINFO:
  862 #ifdef CAPABILITY_MODE
  863                 if (IN_CAPABILITY_MODE(td)) {
  864                         error = ECAPMODE;
  865                         break;
  866                 }
  867 #endif
  868                 error = copyin((void *)arg, &kif_sz, sizeof(kif_sz));
  869                 if (error != 0)
  870                         break;
  871                 if (kif_sz != sizeof(*kif)) {
  872                         error = EINVAL;
  873                         break;
  874                 }
  875                 kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK | M_ZERO);
  876                 FILEDESC_SLOCK(fdp);
  877                 error = fget_cap_locked(fdp, fd, &cap_fcntl_rights, &fp, NULL);
  878                 if (error == 0 && fhold(fp)) {
  879                         export_file_to_kinfo(fp, fd, NULL, kif, fdp, 0);
  880                         FILEDESC_SUNLOCK(fdp);
  881                         fdrop(fp, td);
  882                         if ((kif->kf_status & KF_ATTR_VALID) != 0) {
  883                                 kif->kf_structsize = sizeof(*kif);
  884                                 error = copyout(kif, (void *)arg, sizeof(*kif));
  885                         } else {
  886                                 error = EBADF;
  887                         }
  888                 } else {
  889                         FILEDESC_SUNLOCK(fdp);
  890                         if (error == 0)
  891                                 error = EBADF;
  892                 }
  893                 free(kif, M_TEMP);
  894                 break;
  895 
  896         default:
  897                 error = EINVAL;
  898                 break;
  899         }
  900         return (error);
  901 }
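      /*
       * Usage sketch (hypothetical userland caller) for the F_SETLK/F_SETLKW
       * path above:
       *
       *   struct flock fl = {
       *           .l_type = F_WRLCK,      (the fd must be open for writing)
       *           .l_whence = SEEK_SET,
       *           .l_start = 0,
       *           .l_len = 0,             (zero length means "to end of file")
       *   };
       *   if (fcntl(fd, F_SETLKW, &fl) == -1)
       *           err(1, "fcntl(F_SETLKW)");
       *
       * With SEEK_CUR the kernel first rebases l_start against the file's
       * current offset, and once VOP_ADVLOCK() has granted the lock it
       * rechecks that the descriptor still refers to the same file,
       * silently undoing the lock if the descriptor was closed in the
       * meantime.
       */
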
  902 
  903 static int
  904 getmaxfd(struct thread *td)
  905 {
  906 
  907         return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
  908 }
  909 
  910 /*
  911  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  912  */
  913 int
  914 kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
  915 {
  916         struct filedesc *fdp;
  917         struct filedescent *oldfde, *newfde;
  918         struct proc *p;
  919         struct file *delfp, *oldfp;
  920         u_long *oioctls, *nioctls;
  921         int error, maxfd;
  922 
  923         p = td->td_proc;
  924         fdp = p->p_fd;
  925         oioctls = NULL;
  926 
  927         MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
  928         MPASS(mode < FDDUP_LASTMODE);
  929 
  930         AUDIT_ARG_FD(old);
  931         /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
  932 
  933         /*
  934          * Verify we have a valid descriptor to dup from and possibly to
  935          * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
  936          * return EINVAL when the new descriptor is out of bounds.
  937          */
  938         if (old < 0)
  939                 return (EBADF);
  940         if (new < 0)
  941                 return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
  942         maxfd = getmaxfd(td);
  943         if (new >= maxfd)
  944                 return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
  945 
  946         error = EBADF;
  947         FILEDESC_XLOCK(fdp);
  948         if (fget_locked(fdp, old) == NULL)
  949                 goto unlock;
  950         if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
  951                 td->td_retval[0] = new;
  952                 if (flags & FDDUP_FLAG_CLOEXEC)
  953                         fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
  954                 error = 0;
  955                 goto unlock;
  956         }
  957 
  958         oldfde = &fdp->fd_ofiles[old];
  959         oldfp = oldfde->fde_file;
  960         if (!fhold(oldfp))
  961                 goto unlock;
  962 
  963         /*
  964          * If the caller specified a file descriptor, make sure the file
  965          * table is large enough to hold it, and grab it.  Otherwise, just
  966          * allocate a new descriptor the usual way.
  967          */
  968         switch (mode) {
  969         case FDDUP_NORMAL:
  970         case FDDUP_FCNTL:
  971                 if ((error = fdalloc(td, new, &new)) != 0) {
  972                         fdrop(oldfp, td);
  973                         goto unlock;
  974                 }
  975                 break;
  976         case FDDUP_MUSTREPLACE:
  977                 /* Target file descriptor must exist. */
  978                 if (fget_locked(fdp, new) == NULL) {
  979                         fdrop(oldfp, td);
  980                         goto unlock;
  981                 }
  982                 break;
  983         case FDDUP_FIXED:
  984                 if (new >= fdp->fd_nfiles) {
  985                         /*
  986                          * The resource limits are here instead of e.g.
  987                          * fdalloc(), because the file descriptor table may be
  988                          * shared between processes, so we can't really use
  989                          * racct_add()/racct_sub().  Instead of counting the
  990                          * number of actually allocated descriptors, just put
  991                          * the limit on the size of the file descriptor table.
  992                          */
  993 #ifdef RACCT
  994                         if (RACCT_ENABLED()) {
  995                                 error = racct_set_unlocked(p, RACCT_NOFILE, new + 1);
  996                                 if (error != 0) {
  997                                         error = EMFILE;
  998                                         fdrop(oldfp, td);
  999                                         goto unlock;
 1000                                 }
 1001                         }
 1002 #endif
 1003                         fdgrowtable_exp(fdp, new + 1);
 1004                 }
 1005                 if (!fdisused(fdp, new))
 1006                         fdused(fdp, new);
 1007                 break;
 1008         default:
 1009                 KASSERT(0, ("%s unsupported mode %d", __func__, mode));
 1010         }
 1011 
 1012         KASSERT(old != new, ("new fd is same as old"));
 1013 
 1014         /* Refetch oldfde because the table may have grown and old one freed. */
 1015         oldfde = &fdp->fd_ofiles[old];
 1016         KASSERT(oldfp == oldfde->fde_file,
 1017             ("fdt_ofiles shift from growth observed at fd %d",
 1018             old));
 1019 
 1020         newfde = &fdp->fd_ofiles[new];
 1021         delfp = newfde->fde_file;
 1022 
 1023         nioctls = filecaps_copy_prep(&oldfde->fde_caps);
 1024 
 1025         /*
 1026          * Duplicate the source descriptor.
 1027          */
 1028 #ifdef CAPABILITIES
 1029         seqc_write_begin(&newfde->fde_seqc);
 1030 #endif
 1031         oioctls = filecaps_free_prep(&newfde->fde_caps);
 1032         memcpy(newfde, oldfde, fde_change_size);
 1033         filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
 1034             nioctls);
 1035         if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
 1036                 newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 1037         else
 1038                 newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 1039 #ifdef CAPABILITIES
 1040         seqc_write_end(&newfde->fde_seqc);
 1041 #endif
 1042         td->td_retval[0] = new;
 1043 
 1044         error = 0;
 1045 
 1046         if (delfp != NULL) {
 1047                 (void) closefp(fdp, new, delfp, td, true, false);
 1048                 FILEDESC_UNLOCK_ASSERT(fdp);
 1049         } else {
 1050 unlock:
 1051                 FILEDESC_XUNLOCK(fdp);
 1052         }
 1053 
 1054         filecaps_free_finish(oioctls);
 1055         return (error);
 1056 }
 1057 
 1058 static void
 1059 sigiofree(struct sigio *sigio)
 1060 {
 1061         crfree(sigio->sio_ucred);
 1062         free(sigio, M_SIGIO);
 1063 }
 1064 
 1065 static struct sigio *
 1066 funsetown_locked(struct sigio *sigio)
 1067 {
 1068         struct proc *p;
 1069         struct pgrp *pg;
 1070 
 1071         SIGIO_ASSERT_LOCKED();
 1072 
 1073         if (sigio == NULL)
 1074                 return (NULL);
 1075         *sigio->sio_myref = NULL;
 1076         if (sigio->sio_pgid < 0) {
 1077                 pg = sigio->sio_pgrp;
 1078                 PGRP_LOCK(pg);
 1079                 SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio);
 1080                 PGRP_UNLOCK(pg);
 1081         } else {
 1082                 p = sigio->sio_proc;
 1083                 PROC_LOCK(p);
 1084                 SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
 1085                 PROC_UNLOCK(p);
 1086         }
 1087         return (sigio);
 1088 }
 1089 
 1090 /*
 1091  * If sigio is on the list associated with a process or process group,
 1092  * disable signalling from the device, remove sigio from the list and
 1093  * free sigio.
 1094  */
 1095 void
 1096 funsetown(struct sigio **sigiop)
 1097 {
 1098         struct sigio *sigio;
 1099 
 1100         /* Racy check, consumers must provide synchronization. */
 1101         if (*sigiop == NULL)
 1102                 return;
 1103 
 1104         SIGIO_LOCK();
 1105         sigio = funsetown_locked(*sigiop);
 1106         SIGIO_UNLOCK();
 1107         if (sigio != NULL)
 1108                 sigiofree(sigio);
 1109 }
 1110 
 1111 /*
 1112  * Free a list of sigio structures.  The caller must ensure that new sigio
 1113  * structures cannot be added after this point.  For process groups this is
 1114  * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
 1115  * as an interlock.
 1116  */
 1117 void
 1118 funsetownlst(struct sigiolst *sigiolst)
 1119 {
 1120         struct proc *p;
 1121         struct pgrp *pg;
 1122         struct sigio *sigio, *tmp;
 1123 
 1124         /* Racy check. */
 1125         sigio = SLIST_FIRST(sigiolst);
 1126         if (sigio == NULL)
 1127                 return;
 1128 
 1129         p = NULL;
 1130         pg = NULL;
 1131 
 1132         SIGIO_LOCK();
 1133         sigio = SLIST_FIRST(sigiolst);
 1134         if (sigio == NULL) {
 1135                 SIGIO_UNLOCK();
 1136                 return;
 1137         }
 1138 
 1139         /*
 1140          * Every entry of the list should belong to a single proc or pgrp.
 1141          */
 1142         if (sigio->sio_pgid < 0) {
 1143                 pg = sigio->sio_pgrp;
 1144                 sx_assert(&proctree_lock, SX_XLOCKED);
 1145                 PGRP_LOCK(pg);
 1146         } else /* if (sigio->sio_pgid > 0) */ {
 1147                 p = sigio->sio_proc;
 1148                 PROC_LOCK(p);
 1149                 KASSERT((p->p_flag & P_WEXIT) != 0,
 1150                     ("%s: process %p is not exiting", __func__, p));
 1151         }
 1152 
 1153         SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
 1154                 *sigio->sio_myref = NULL;
 1155                 if (pg != NULL) {
 1156                         KASSERT(sigio->sio_pgid < 0,
 1157                             ("Proc sigio in pgrp sigio list"));
 1158                         KASSERT(sigio->sio_pgrp == pg,
 1159                             ("Bogus pgrp in sigio list"));
 1160                 } else /* if (p != NULL) */ {
 1161                         KASSERT(sigio->sio_pgid > 0,
 1162                             ("Pgrp sigio in proc sigio list"));
 1163                         KASSERT(sigio->sio_proc == p,
 1164                             ("Bogus proc in sigio list"));
 1165                 }
 1166         }
 1167 
 1168         if (pg != NULL)
 1169                 PGRP_UNLOCK(pg);
 1170         else
 1171                 PROC_UNLOCK(p);
 1172         SIGIO_UNLOCK();
 1173 
 1174         SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
 1175                 sigiofree(sigio);
 1176 }
 1177 
 1178 /*
 1179  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 1180  *
 1181  * After permission checking, add a sigio structure to the sigio list for
 1182  * the process or process group.
 1183  */
 1184 int
 1185 fsetown(pid_t pgid, struct sigio **sigiop)
 1186 {
 1187         struct proc *proc;
 1188         struct pgrp *pgrp;
 1189         struct sigio *osigio, *sigio;
 1190         int ret;
 1191 
 1192         if (pgid == 0) {
 1193                 funsetown(sigiop);
 1194                 return (0);
 1195         }
 1196 
 1197         sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 1198         sigio->sio_pgid = pgid;
 1199         sigio->sio_ucred = crhold(curthread->td_ucred);
 1200         sigio->sio_myref = sigiop;
 1201 
 1202         ret = 0;
 1203         if (pgid > 0) {
 1204                 ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc);
 1205                 SIGIO_LOCK();
 1206                 osigio = funsetown_locked(*sigiop);
 1207                 if (ret == 0) {
 1208                         PROC_LOCK(proc);
 1209                         _PRELE(proc);
 1210                         if ((proc->p_flag & P_WEXIT) != 0) {
 1211                                 ret = ESRCH;
 1212                         } else if (proc->p_session !=
 1213                             curthread->td_proc->p_session) {
 1214                                 /*
 1215                                  * Policy - Don't allow a process to FSETOWN a
 1216                                  * process in another session.
 1217                                  *
 1218                                  * Remove this test to allow maximum flexibility
 1219                                  * or restrict FSETOWN to the current process or
 1220                                  * process group for maximum safety.
 1221                                  */
 1222                                 ret = EPERM;
 1223                         } else {
 1224                                 sigio->sio_proc = proc;
 1225                                 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio,
 1226                                     sio_pgsigio);
 1227                         }
 1228                         PROC_UNLOCK(proc);
 1229                 }
 1230         } else /* if (pgid < 0) */ {
 1231                 sx_slock(&proctree_lock);
 1232                 SIGIO_LOCK();
 1233                 osigio = funsetown_locked(*sigiop);
 1234                 pgrp = pgfind(-pgid);
 1235                 if (pgrp == NULL) {
 1236                         ret = ESRCH;
 1237                 } else {
 1238                         if (pgrp->pg_session != curthread->td_proc->p_session) {
 1239                                 /*
 1240                                  * Policy - Don't allow a process to FSETOWN a
 1241                                  * process in another session.
 1242                                  *
 1243                                  * Remove this test to allow maximum flexibility
 1244                                  * or restrict FSETOWN to the current process or
 1245                                  * process group for maximum safety.
 1246                                  */
 1247                                 ret = EPERM;
 1248                         } else {
 1249                                 sigio->sio_pgrp = pgrp;
 1250                                 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio,
 1251                                     sio_pgsigio);
 1252                         }
 1253                         PGRP_UNLOCK(pgrp);
 1254                 }
 1255                 sx_sunlock(&proctree_lock);
 1256         }
 1257         if (ret == 0)
 1258                 *sigiop = sigio;
 1259         SIGIO_UNLOCK();
 1260         if (osigio != NULL)
 1261                 sigiofree(osigio);
 1262         return (ret);
 1263 }
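      /*
       * Sketch of the resulting F_SETOWN/FIOSETOWN semantics: a positive
       * argument names a process, a negative one names a process group
       * and zero clears the owner, e.g.
       *
       *   fcntl(fd, F_SETOWN, getpid());    deliver SIGIO/SIGURG to this process
       *   fcntl(fd, F_SETOWN, -getpgrp());  deliver to this process group
       *   fcntl(fd, F_SETOWN, 0);           stop delivery altogether
       *
       * In the non-zero cases the target must belong to the caller's
       * session, otherwise EPERM is returned.  fgetown() below hands the
       * stored value back with the same sign convention.
       */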
 1264 
 1265 /*
 1266  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
 1267  */
 1268 pid_t
 1269 fgetown(struct sigio **sigiop)
 1270 {
 1271         pid_t pgid;
 1272 
 1273         SIGIO_LOCK();
 1274         pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 1275         SIGIO_UNLOCK();
 1276         return (pgid);
 1277 }
 1278 
 1279 static int
 1280 closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
 1281     bool audit)
 1282 {
 1283         int error;
 1284 
 1285         FILEDESC_XLOCK_ASSERT(fdp);
 1286 
 1287         /*
 1288          * We now hold the fp reference that used to be owned by the
 1289          * descriptor array.  We have to unlock the FILEDESC *AFTER*
 1290          * knote_fdclose to prevent a race of the fd getting opened, a knote
 1291          * added, and deleteing a knote for the new fd.
 1292          */
 1293         if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
 1294                 knote_fdclose(td, fd);
 1295 
 1296         /*
 1297          * We need to notify mqueue if the object is of type mqueue.
 1298          */
 1299         if (__predict_false(fp->f_type == DTYPE_MQUEUE))
 1300                 mq_fdclose(td, fd, fp);
 1301         FILEDESC_XUNLOCK(fdp);
 1302 
 1303 #ifdef AUDIT
 1304         if (AUDITING_TD(td) && audit)
 1305                 audit_sysclose(td, fd, fp);
 1306 #endif
 1307         error = closef(fp, td);
 1308 
 1309         /*
 1310          * All paths leading up to closefp() will have already removed or
 1311          * replaced the fd in the filedesc table, so a restart would not
 1312          * operate on the same file.
 1313          */
 1314         if (error == ERESTART)
 1315                 error = EINTR;
 1316 
 1317         return (error);
 1318 }
 1319 
 1320 static int
 1321 closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
 1322     bool holdleaders, bool audit)
 1323 {
 1324         int error;
 1325 
 1326         FILEDESC_XLOCK_ASSERT(fdp);
 1327 
 1328         if (holdleaders) {
 1329                 if (td->td_proc->p_fdtol != NULL) {
 1330                         /*
 1331                          * Ask fdfree() to sleep to ensure that all relevant
 1332                          * process leaders can be traversed in closef().
 1333                          */
 1334                         fdp->fd_holdleaderscount++;
 1335                 } else {
 1336                         holdleaders = false;
 1337                 }
 1338         }
 1339 
 1340         error = closefp_impl(fdp, fd, fp, td, audit);
 1341         if (holdleaders) {
 1342                 FILEDESC_XLOCK(fdp);
 1343                 fdp->fd_holdleaderscount--;
 1344                 if (fdp->fd_holdleaderscount == 0 &&
 1345                     fdp->fd_holdleaderswakeup != 0) {
 1346                         fdp->fd_holdleaderswakeup = 0;
 1347                         wakeup(&fdp->fd_holdleaderscount);
 1348                 }
 1349                 FILEDESC_XUNLOCK(fdp);
 1350         }
 1351         return (error);
 1352 }
 1353 
 1354 static int
 1355 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
 1356     bool holdleaders, bool audit)
 1357 {
 1358 
 1359         FILEDESC_XLOCK_ASSERT(fdp);
 1360 
 1361         if (__predict_false(td->td_proc->p_fdtol != NULL)) {
 1362                 return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
 1363         } else {
 1364                 return (closefp_impl(fdp, fd, fp, td, audit));
 1365         }
 1366 }
 1367 
 1368 /*
 1369  * Close a file descriptor.
 1370  */
 1371 #ifndef _SYS_SYSPROTO_H_
 1372 struct close_args {
 1373         int     fd;
 1374 };
 1375 #endif
 1376 /* ARGSUSED */
 1377 int
 1378 sys_close(struct thread *td, struct close_args *uap)
 1379 {
 1380 
 1381         return (kern_close(td, uap->fd));
 1382 }
 1383 
 1384 int
 1385 kern_close(struct thread *td, int fd)
 1386 {
 1387         struct filedesc *fdp;
 1388         struct file *fp;
 1389 
 1390         fdp = td->td_proc->p_fd;
 1391 
 1392         FILEDESC_XLOCK(fdp);
 1393         if ((fp = fget_locked(fdp, fd)) == NULL) {
 1394                 FILEDESC_XUNLOCK(fdp);
 1395                 return (EBADF);
 1396         }
 1397         fdfree(fdp, fd);
 1398 
 1399         /* closefp() drops the FILEDESC lock for us. */
 1400         return (closefp(fdp, fd, fp, td, true, true));
 1401 }
 1402 
 1403 static int
 1404 close_range_cloexec(struct thread *td, u_int lowfd, u_int highfd)
 1405 {
 1406         struct filedesc *fdp;
 1407         struct fdescenttbl *fdt;
 1408         struct filedescent *fde;
 1409         int fd;
 1410 
 1411         fdp = td->td_proc->p_fd;
 1412         FILEDESC_XLOCK(fdp);
 1413         fdt = atomic_load_ptr(&fdp->fd_files);
 1414         highfd = MIN(highfd, fdt->fdt_nfiles - 1);
 1415         fd = lowfd;
 1416         if (__predict_false(fd > highfd)) {
 1417                 goto out_locked;
 1418         }
 1419         for (; fd <= highfd; fd++) {
 1420                 fde = &fdt->fdt_ofiles[fd];
 1421                 if (fde->fde_file != NULL)
 1422                         fde->fde_flags |= UF_EXCLOSE;
 1423         }
 1424 out_locked:
 1425         FILEDESC_XUNLOCK(fdp);
 1426         return (0);
 1427 }
 1428 
 1429 static int
 1430 close_range_impl(struct thread *td, u_int lowfd, u_int highfd)
 1431 {
 1432         struct filedesc *fdp;
 1433         const struct fdescenttbl *fdt;
 1434         struct file *fp;
 1435         int fd;
 1436 
 1437         fdp = td->td_proc->p_fd;
 1438         FILEDESC_XLOCK(fdp);
 1439         fdt = atomic_load_ptr(&fdp->fd_files);
 1440         highfd = MIN(highfd, fdt->fdt_nfiles - 1);
 1441         fd = lowfd;
 1442         if (__predict_false(fd > highfd)) {
 1443                 goto out_locked;
 1444         }
 1445         for (;;) {
 1446                 fp = fdt->fdt_ofiles[fd].fde_file;
 1447                 if (fp == NULL) {
 1448                         if (fd == highfd)
 1449                                 goto out_locked;
 1450                 } else {
 1451                         fdfree(fdp, fd);
 1452                         (void) closefp(fdp, fd, fp, td, true, true);
 1453                         if (fd == highfd)
 1454                                 goto out_unlocked;
 1455                         FILEDESC_XLOCK(fdp);
 1456                         fdt = atomic_load_ptr(&fdp->fd_files);
 1457                 }
 1458                 fd++;
 1459         }
 1460 out_locked:
 1461         FILEDESC_XUNLOCK(fdp);
 1462 out_unlocked:
 1463         return (0);
 1464 }
 1465 
 1466 int
 1467 kern_close_range(struct thread *td, int flags, u_int lowfd, u_int highfd)
 1468 {
 1469 
 1470         /*
 1471          * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
 1472          * open should not be a usage error.  From a close_range() perspective,
 1473          * close_range(3, ~0U, 0) in the same scenario should likewise not
 1474          * be a usage error, as all fds >= 3 are in fact already closed.
 1475          */
 1476         if (highfd < lowfd) {
 1477                 return (EINVAL);
 1478         }
 1479 
 1480         if ((flags & CLOSE_RANGE_CLOEXEC) != 0)
 1481                 return (close_range_cloexec(td, lowfd, highfd));
 1482 
 1483         return (close_range_impl(td, lowfd, highfd));
 1484 }
 1485 
 1486 #ifndef _SYS_SYSPROTO_H_
 1487 struct close_range_args {
 1488         u_int   lowfd;
 1489         u_int   highfd;
 1490         int     flags;
 1491 };
 1492 #endif
 1493 int
 1494 sys_close_range(struct thread *td, struct close_range_args *uap)
 1495 {
 1496 
 1497         AUDIT_ARG_FD(uap->lowfd);
 1498         AUDIT_ARG_CMD(uap->highfd);
 1499         AUDIT_ARG_FFLAGS(uap->flags);
 1500 
 1501         if ((uap->flags & ~(CLOSE_RANGE_CLOEXEC)) != 0)
 1502                 return (EINVAL);
 1503         return (kern_close_range(td, uap->flags, uap->lowfd, uap->highfd));
 1504 }
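
/*
 * Illustrative userspace sketch of how sys_close_range() above is reached.
 * Assumes a FreeBSD 13 (or newer) libc where close_range() and
 * CLOSE_RANGE_CLOEXEC are visible through <unistd.h>; the helper names here
 * are made up for the example.
 */
#include <unistd.h>
#include <stdio.h>

/* Close every descriptor from lowfd up, like closefrom(lowfd). */
static int
example_close_from(unsigned int lowfd)
{
	if (close_range(lowfd, ~0U, 0) == -1) {
		perror("close_range");
		return (-1);
	}
	return (0);
}

/* Mark lowfd..highfd close-on-exec now, without closing anything yet. */
static int
example_cloexec_range(unsigned int lowfd, unsigned int highfd)
{
	if (close_range(lowfd, highfd, CLOSE_RANGE_CLOEXEC) == -1) {
		perror("close_range");
		return (-1);
	}
	return (0);
}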
 1505 
 1506 #ifdef COMPAT_FREEBSD12
 1507 /*
 1508  * Close open file descriptors.
 1509  */
 1510 #ifndef _SYS_SYSPROTO_H_
 1511 struct freebsd12_closefrom_args {
 1512         int     lowfd;
 1513 };
 1514 #endif
 1515 /* ARGSUSED */
 1516 int
 1517 freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
 1518 {
 1519         u_int lowfd;
 1520 
 1521         AUDIT_ARG_FD(uap->lowfd);
 1522 
 1523         /*
 1524          * Treat negative starting file descriptor values identically to
 1525          * closefrom(0), which closes all files.
 1526          */
 1527         lowfd = MAX(0, uap->lowfd);
 1528         return (kern_close_range(td, 0, lowfd, ~0U));
 1529 }
 1530 #endif  /* COMPAT_FREEBSD12 */
 1531 
 1532 #if defined(COMPAT_43)
 1533 /*
 1534  * Return status information about a file descriptor.
 1535  */
 1536 #ifndef _SYS_SYSPROTO_H_
 1537 struct ofstat_args {
 1538         int     fd;
 1539         struct  ostat *sb;
 1540 };
 1541 #endif
 1542 /* ARGSUSED */
 1543 int
 1544 ofstat(struct thread *td, struct ofstat_args *uap)
 1545 {
 1546         struct ostat oub;
 1547         struct stat ub;
 1548         int error;
 1549 
 1550         error = kern_fstat(td, uap->fd, &ub);
 1551         if (error == 0) {
 1552                 cvtstat(&ub, &oub);
 1553                 error = copyout(&oub, uap->sb, sizeof(oub));
 1554         }
 1555         return (error);
 1556 }
 1557 #endif /* COMPAT_43 */
 1558 
 1559 #if defined(COMPAT_FREEBSD11)
 1560 int
 1561 freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
 1562 {
 1563         struct stat sb;
 1564         struct freebsd11_stat osb;
 1565         int error;
 1566 
 1567         error = kern_fstat(td, uap->fd, &sb);
 1568         if (error != 0)
 1569                 return (error);
 1570         error = freebsd11_cvtstat(&sb, &osb);
 1571         if (error == 0)
 1572                 error = copyout(&osb, uap->sb, sizeof(osb));
 1573         return (error);
 1574 }
 1575 #endif  /* COMPAT_FREEBSD11 */
 1576 
 1577 /*
 1578  * Return status information about a file descriptor.
 1579  */
 1580 #ifndef _SYS_SYSPROTO_H_
 1581 struct fstat_args {
 1582         int     fd;
 1583         struct  stat *sb;
 1584 };
 1585 #endif
 1586 /* ARGSUSED */
 1587 int
 1588 sys_fstat(struct thread *td, struct fstat_args *uap)
 1589 {
 1590         struct stat ub;
 1591         int error;
 1592 
 1593         error = kern_fstat(td, uap->fd, &ub);
 1594         if (error == 0)
 1595                 error = copyout(&ub, uap->sb, sizeof(ub));
 1596         return (error);
 1597 }
 1598 
 1599 int
 1600 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 1601 {
 1602         struct file *fp;
 1603         int error;
 1604 
 1605         AUDIT_ARG_FD(fd);
 1606 
 1607         error = fget(td, fd, &cap_fstat_rights, &fp);
 1608         if (__predict_false(error != 0))
 1609                 return (error);
 1610 
 1611         AUDIT_ARG_FILE(td->td_proc, fp);
 1612 
 1613         error = fo_stat(fp, sbp, td->td_ucred, td);
 1614         fdrop(fp, td);
 1615 #ifdef __STAT_TIME_T_EXT
 1616         sbp->st_atim_ext = 0;
 1617         sbp->st_mtim_ext = 0;
 1618         sbp->st_ctim_ext = 0;
 1619         sbp->st_btim_ext = 0;
 1620 #endif
 1621 #ifdef KTRACE
 1622         if (KTRPOINT(td, KTR_STRUCT))
 1623                 ktrstat_error(sbp, error);
 1624 #endif
 1625         return (error);
 1626 }
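
/*
 * Illustrative userspace sketch: fstat(2), serviced by kern_fstat() above,
 * queried on an already-open descriptor.  Standard POSIX interfaces only;
 * the helper name is made up for the example.
 */
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>

static int
example_print_fd_status(int fd)
{
	struct stat sb;

	if (fstat(fd, &sb) == -1) {
		perror("fstat");
		return (-1);
	}
	printf("fd %d: %jd bytes, mode %o\n", fd, (intmax_t)sb.st_size,
	    (unsigned int)sb.st_mode & 07777);
	return (0);
}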
 1627 
 1628 #if defined(COMPAT_FREEBSD11)
 1629 /*
 1630  * Return status information about a file descriptor.
 1631  */
 1632 #ifndef _SYS_SYSPROTO_H_
 1633 struct freebsd11_nfstat_args {
 1634         int     fd;
 1635         struct  nstat *sb;
 1636 };
 1637 #endif
 1638 /* ARGSUSED */
 1639 int
 1640 freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
 1641 {
 1642         struct nstat nub;
 1643         struct stat ub;
 1644         int error;
 1645 
 1646         error = kern_fstat(td, uap->fd, &ub);
 1647         if (error == 0) {
 1648                 freebsd11_cvtnstat(&ub, &nub);
 1649                 error = copyout(&nub, uap->sb, sizeof(nub));
 1650         }
 1651         return (error);
 1652 }
 1653 #endif /* COMPAT_FREEBSD11 */
 1654 
 1655 /*
 1656  * Return pathconf information about a file descriptor.
 1657  */
 1658 #ifndef _SYS_SYSPROTO_H_
 1659 struct fpathconf_args {
 1660         int     fd;
 1661         int     name;
 1662 };
 1663 #endif
 1664 /* ARGSUSED */
 1665 int
 1666 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 1667 {
 1668         long value;
 1669         int error;
 1670 
 1671         error = kern_fpathconf(td, uap->fd, uap->name, &value);
 1672         if (error == 0)
 1673                 td->td_retval[0] = value;
 1674         return (error);
 1675 }
 1676 
 1677 int
 1678 kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
 1679 {
 1680         struct file *fp;
 1681         struct vnode *vp;
 1682         int error;
 1683 
 1684         error = fget(td, fd, &cap_fpathconf_rights, &fp);
 1685         if (error != 0)
 1686                 return (error);
 1687 
 1688         if (name == _PC_ASYNC_IO) {
 1689                 *valuep = _POSIX_ASYNCHRONOUS_IO;
 1690                 goto out;
 1691         }
 1692         vp = fp->f_vnode;
 1693         if (vp != NULL) {
 1694                 vn_lock(vp, LK_SHARED | LK_RETRY);
 1695                 error = VOP_PATHCONF(vp, name, valuep);
 1696                 VOP_UNLOCK(vp);
 1697         } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 1698                 if (name != _PC_PIPE_BUF) {
 1699                         error = EINVAL;
 1700                 } else {
 1701                         *valuep = PIPE_BUF;
 1702                         error = 0;
 1703                 }
 1704         } else {
 1705                 error = EOPNOTSUPP;
 1706         }
 1707 out:
 1708         fdrop(fp, td);
 1709         return (error);
 1710 }
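
/*
 * Illustrative userspace sketch: the pipe/socket branch above is what answers
 * fpathconf(fd, _PC_PIPE_BUF).  Standard POSIX interfaces; the helper name is
 * made up for the example.
 */
#include <unistd.h>
#include <stdio.h>
#include <errno.h>

static void
example_show_pipe_buf(void)
{
	int fds[2];
	long v;

	if (pipe(fds) == -1) {
		perror("pipe");
		return;
	}
	errno = 0;
	v = fpathconf(fds[0], _PC_PIPE_BUF);
	if (v == -1 && errno != 0)
		perror("fpathconf");
	else
		printf("PIPE_BUF for this pipe: %ld\n", v);
	close(fds[0]);
	close(fds[1]);
}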
 1711 
 1712 /*
 1713  * Copy filecaps structure allocating memory for ioctls array if needed.
 1714  *
 1715  * The last parameter indicates whether the fdtable is locked. If it is not and
 1716  * ioctls are encountered, copying fails and the caller must lock the table.
 1717  *
 1718  * Note that if the table was not locked, the caller has to check the relevant
 1719  * sequence counter to determine whether the operation was successful.
 1720  */
 1721 bool
 1722 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
 1723 {
 1724         size_t size;
 1725 
 1726         if (src->fc_ioctls != NULL && !locked)
 1727                 return (false);
 1728         memcpy(dst, src, sizeof(*src));
 1729         if (src->fc_ioctls == NULL)
 1730                 return (true);
 1731 
 1732         KASSERT(src->fc_nioctls > 0,
 1733             ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 1734 
 1735         size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 1736         dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 1737         memcpy(dst->fc_ioctls, src->fc_ioctls, size);
 1738         return (true);
 1739 }
 1740 
 1741 static u_long *
 1742 filecaps_copy_prep(const struct filecaps *src)
 1743 {
 1744         u_long *ioctls;
 1745         size_t size;
 1746 
 1747         if (__predict_true(src->fc_ioctls == NULL))
 1748                 return (NULL);
 1749 
 1750         KASSERT(src->fc_nioctls > 0,
 1751             ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 1752 
 1753         size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 1754         ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 1755         return (ioctls);
 1756 }
 1757 
 1758 static void
 1759 filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
 1760     u_long *ioctls)
 1761 {
 1762         size_t size;
 1763 
 1764         *dst = *src;
 1765         if (__predict_true(src->fc_ioctls == NULL)) {
 1766                 MPASS(ioctls == NULL);
 1767                 return;
 1768         }
 1769 
 1770         size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 1771         dst->fc_ioctls = ioctls;
 1772         bcopy(src->fc_ioctls, dst->fc_ioctls, size);
 1773 }
 1774 
 1775 /*
 1776  * Move filecaps structure to the new place and clear the old place.
 1777  */
 1778 void
 1779 filecaps_move(struct filecaps *src, struct filecaps *dst)
 1780 {
 1781 
 1782         *dst = *src;
 1783         bzero(src, sizeof(*src));
 1784 }
 1785 
 1786 /*
 1787  * Fill the given filecaps structure with full rights.
 1788  */
 1789 static void
 1790 filecaps_fill(struct filecaps *fcaps)
 1791 {
 1792 
 1793         CAP_ALL(&fcaps->fc_rights);
 1794         fcaps->fc_ioctls = NULL;
 1795         fcaps->fc_nioctls = -1;
 1796         fcaps->fc_fcntls = CAP_FCNTL_ALL;
 1797 }
 1798 
 1799 /*
 1800  * Free memory allocated within filecaps structure.
 1801  */
 1802 static void
 1803 filecaps_free_ioctl(struct filecaps *fcaps)
 1804 {
 1805 
 1806         free(fcaps->fc_ioctls, M_FILECAPS);
 1807         fcaps->fc_ioctls = NULL;
 1808 }
 1809 
 1810 void
 1811 filecaps_free(struct filecaps *fcaps)
 1812 {
 1813 
 1814         filecaps_free_ioctl(fcaps);
 1815         bzero(fcaps, sizeof(*fcaps));
 1816 }
 1817 
 1818 static u_long *
 1819 filecaps_free_prep(struct filecaps *fcaps)
 1820 {
 1821         u_long *ioctls;
 1822 
 1823         ioctls = fcaps->fc_ioctls;
 1824         bzero(fcaps, sizeof(*fcaps));
 1825         return (ioctls);
 1826 }
 1827 
 1828 static void
 1829 filecaps_free_finish(u_long *ioctls)
 1830 {
 1831 
 1832         free(ioctls, M_FILECAPS);
 1833 }
 1834 
 1835 /*
 1836  * Validate the given filecaps structure.
 1837  */
 1838 static void
 1839 filecaps_validate(const struct filecaps *fcaps, const char *func)
 1840 {
 1841 
 1842         KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 1843             ("%s: invalid rights", func));
 1844         KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 1845             ("%s: invalid fcntls", func));
 1846         KASSERT(fcaps->fc_fcntls == 0 ||
 1847             cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 1848             ("%s: fcntls without CAP_FCNTL", func));
 1849         /*
 1850          * open calls without WANTIOCTLCAPS free caps but leave the counter
 1851          */
 1852 #if 0
 1853         KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
 1854             (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
 1855             ("%s: invalid ioctls", func));
 1856 #endif
 1857         KASSERT(fcaps->fc_nioctls == 0 ||
 1858             cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 1859             ("%s: ioctls without CAP_IOCTL", func));
 1860 }
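
/*
 * Illustrative userspace sketch: the per-descriptor rights and ioctl lists
 * handled by the filecaps_*() routines are installed from userspace with the
 * Capsicum limit calls.  Assumes a kernel built with Capsicum support; the
 * helper name is made up for the example.
 */
#include <sys/capsicum.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <stdio.h>

static int
example_limit_fd(int fd)
{
	cap_rights_t rights;
	const unsigned long cmds[] = { FIONREAD };

	/* Allow only read, seek and ioctl on this descriptor... */
	cap_rights_init(&rights, CAP_READ, CAP_SEEK, CAP_IOCTL);
	if (cap_rights_limit(fd, &rights) == -1) {
		perror("cap_rights_limit");
		return (-1);
	}
	/* ...and, within CAP_IOCTL, only the FIONREAD command. */
	if (cap_ioctls_limit(fd, cmds, 1) == -1) {
		perror("cap_ioctls_limit");
		return (-1);
	}
	return (0);
}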
 1861 
 1862 static void
 1863 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 1864 {
 1865         int nfd1;
 1866 
 1867         FILEDESC_XLOCK_ASSERT(fdp);
 1868 
 1869         nfd1 = fdp->fd_nfiles * 2;
 1870         if (nfd1 < nfd)
 1871                 nfd1 = nfd;
 1872         fdgrowtable(fdp, nfd1);
 1873 }
 1874 
 1875 /*
 1876  * Grow the file table to accommodate (at least) nfd descriptors.
 1877  */
 1878 static void
 1879 fdgrowtable(struct filedesc *fdp, int nfd)
 1880 {
 1881         struct filedesc0 *fdp0;
 1882         struct freetable *ft;
 1883         struct fdescenttbl *ntable;
 1884         struct fdescenttbl *otable;
 1885         int nnfiles, onfiles;
 1886         NDSLOTTYPE *nmap, *omap;
 1887 
 1888         KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 1889 
 1890         /* save old values */
 1891         onfiles = fdp->fd_nfiles;
 1892         otable = fdp->fd_files;
 1893         omap = fdp->fd_map;
 1894 
 1895         /* compute the size of the new table */
 1896         nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 1897         if (nnfiles <= onfiles)
 1898                 /* the table is already large enough */
 1899                 return;
 1900 
 1901         /*
 1902          * Allocate a new table.  We need enough space for the number of
 1903          * entries, file entries themselves and the struct freetable we will use
 1904          * when we decommission the table and place it on the freelist.
 1905          * We place the struct freetable in the middle so we don't have
 1906          * to worry about padding.
 1907          */
 1908         ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 1909             nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 1910             sizeof(struct freetable),
 1911             M_FILEDESC, M_ZERO | M_WAITOK);
 1912         /* copy the old data */
 1913         ntable->fdt_nfiles = nnfiles;
 1914         memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 1915             onfiles * sizeof(ntable->fdt_ofiles[0]));
 1916 
 1917         /*
 1918          * Allocate a new map only if the old is not large enough.  It will
 1919          * grow at a slower rate than the table as it can map more
 1920          * entries than the table can hold.
 1921          */
 1922         if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 1923                 nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 1924                     M_ZERO | M_WAITOK);
 1925                 /* copy over the old data and update the pointer */
 1926                 memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 1927                 fdp->fd_map = nmap;
 1928         }
 1929 
 1930         /*
 1931          * Make sure that ntable is correctly initialized before we replace
 1932          * the fd_files pointer.  Otherwise fget_unlocked() may see
 1933          * inconsistent data.
 1934          */
 1935         atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 1936 
 1937         /*
 1938          * Free the old file table when not shared by other threads or processes.
 1939          * The old file table is considered to be shared when either is true:
 1940          * - The process has more than one thread.
 1941          * - The file descriptor table has been shared via fdshare().
 1942          *
 1943          * When shared, the old file table will be placed on a freelist
 1944          * which will be processed when the struct filedesc is released.
 1945          *
 1946          * Note that if onfiles == NDFILE, we're dealing with the original
 1947          * static allocation contained within (struct filedesc0 *)fdp,
 1948          * which must not be freed.
 1949          */
 1950         if (onfiles > NDFILE) {
 1951                 /*
 1952                  * Note we may be called here from fdinit while allocating a
 1953                  * table for a new process in which case ->p_fd points
 1954                  * elsewhere.
 1955                  */
 1956                 if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
 1957                         free(otable, M_FILEDESC);
 1958                 } else {
 1959                         ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 1960                         fdp0 = (struct filedesc0 *)fdp;
 1961                         ft->ft_table = otable;
 1962                         SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 1963                 }
 1964         }
 1965         /*
 1966          * The map does not have the same possibility of threads still
 1967          * holding references to it.  So always free it as long as it
 1968          * does not reference the original static allocation.
 1969          */
 1970         if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 1971                 free(omap, M_FILEDESC);
 1972 }
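
/*
 * Sketch, in C11 atomics, of the publication pattern fdgrowtable() relies on:
 * fully initialize the new object, then publish the pointer with a release
 * store so a reader's acquire load never observes partial contents.  This is
 * only an illustration of the ordering; the kernel uses atomic_store_rel_ptr()
 * and defers freeing the old table, which this sketch does not attempt.
 * Names are made up.
 */
#include <stdatomic.h>
#include <stdlib.h>

struct ex_table {
	int nentries;
	int entries[64];
};

static _Atomic(struct ex_table *) ex_current;

static void
ex_publish(int n)
{
	struct ex_table *t;

	t = calloc(1, sizeof(*t));
	if (t == NULL)
		return;
	t->nentries = n;			/* fully initialize first... */
	atomic_store_explicit(&ex_current, t, memory_order_release);
}

static int
ex_read_nentries(void)
{
	struct ex_table *t;

	t = atomic_load_explicit(&ex_current, memory_order_acquire);
	return (t != NULL ? t->nentries : 0);
}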
 1973 
 1974 /*
 1975  * Allocate a file descriptor for the process.
 1976  */
 1977 int
 1978 fdalloc(struct thread *td, int minfd, int *result)
 1979 {
 1980         struct proc *p = td->td_proc;
 1981         struct filedesc *fdp = p->p_fd;
 1982         int fd, maxfd, allocfd;
 1983 #ifdef RACCT
 1984         int error;
 1985 #endif
 1986 
 1987         FILEDESC_XLOCK_ASSERT(fdp);
 1988 
 1989         if (fdp->fd_freefile > minfd)
 1990                 minfd = fdp->fd_freefile;
 1991 
 1992         maxfd = getmaxfd(td);
 1993 
 1994         /*
 1995          * Search the bitmap for a free descriptor starting at minfd.
 1996          * If none is found, grow the file table.
 1997          */
 1998         fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 1999         if (__predict_false(fd >= maxfd))
 2000                 return (EMFILE);
 2001         if (__predict_false(fd >= fdp->fd_nfiles)) {
 2002                 allocfd = min(fd * 2, maxfd);
 2003 #ifdef RACCT
 2004                 if (RACCT_ENABLED()) {
 2005                         error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
 2006                         if (error != 0)
 2007                                 return (EMFILE);
 2008                 }
 2009 #endif
 2010                 /*
 2011                  * fd is already equal to first free descriptor >= minfd, so
 2012                  * we only need to grow the table and we are done.
 2013                  */
 2014                 fdgrowtable_exp(fdp, allocfd);
 2015         }
 2016 
 2017         /*
 2018          * Perform some sanity checks, then mark the file descriptor as
 2019          * used and return it to the caller.
 2020          */
 2021         KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 2022             ("invalid descriptor %d", fd));
 2023         KASSERT(!fdisused(fdp, fd),
 2024             ("fd_first_free() returned non-free descriptor"));
 2025         KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 2026             ("file descriptor isn't free"));
 2027         fdused(fdp, fd);
 2028         *result = fd;
 2029         return (0);
 2030 }
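
/*
 * Sketch of the bitmap search fdalloc() builds on.  The kernel helper is
 * fd_first_free(), which additionally skips all-ones words; this stand-alone
 * version is deliberately simpler and only illustrates the layout of one bit
 * per descriptor in an array of unsigned long "slots".  Names are made up.
 */
#include <limits.h>

#define	EX_NBITS	(sizeof(unsigned long) * CHAR_BIT)

/* Return the first index >= low whose bit is clear, or size if none. */
static int
example_first_free(const unsigned long *map, int low, int size)
{
	int fd;

	for (fd = low; fd < size; fd++) {
		if ((map[fd / EX_NBITS] & (1UL << (fd % EX_NBITS))) == 0)
			return (fd);
	}
	return (size);
}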
 2031 
 2032 /*
 2033  * Allocate n file descriptors for the process.
 2034  */
 2035 int
 2036 fdallocn(struct thread *td, int minfd, int *fds, int n)
 2037 {
 2038         struct proc *p = td->td_proc;
 2039         struct filedesc *fdp = p->p_fd;
 2040         int i;
 2041 
 2042         FILEDESC_XLOCK_ASSERT(fdp);
 2043 
 2044         for (i = 0; i < n; i++)
 2045                 if (fdalloc(td, 0, &fds[i]) != 0)
 2046                         break;
 2047 
 2048         if (i < n) {
 2049                 for (i--; i >= 0; i--)
 2050                         fdunused(fdp, fds[i]);
 2051                 return (EMFILE);
 2052         }
 2053 
 2054         return (0);
 2055 }
 2056 
 2057 /*
 2058  * Create a new open file structure and allocate a file descriptor for the
 2059  * process that refers to it.  We add one reference to the file for the
 2060  * descriptor table and one reference for resultfp.  This prevents the file
 2061  * from being freed out from under us if we are preempted and the entry in
 2062  * the descriptor table is closed after we release the FILEDESC lock.
 2063  */
 2064 int
 2065 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
 2066     struct filecaps *fcaps)
 2067 {
 2068         struct file *fp;
 2069         int error, fd;
 2070 
 2071         MPASS(resultfp != NULL);
 2072         MPASS(resultfd != NULL);
 2073 
 2074         error = _falloc_noinstall(td, &fp, 2);
 2075         if (__predict_false(error != 0)) {
 2076                 return (error);
 2077         }
 2078 
 2079         error = finstall_refed(td, fp, &fd, flags, fcaps);
 2080         if (__predict_false(error != 0)) {
 2081                 falloc_abort(td, fp);
 2082                 return (error);
 2083         }
 2084 
 2085         *resultfp = fp;
 2086         *resultfd = fd;
 2087 
 2088         return (0);
 2089 }
 2090 
 2091 /*
 2092  * Create a new open file structure without allocating a file descriptor.
 2093  */
 2094 int
 2095 _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
 2096 {
 2097         struct file *fp;
 2098         int maxuserfiles = maxfiles - (maxfiles / 20);
 2099         int openfiles_new;
 2100         static struct timeval lastfail;
 2101         static int curfail;
 2102 
 2103         KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 2104         MPASS(n > 0);
 2105 
 2106         openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
 2107         if ((openfiles_new >= maxuserfiles &&
 2108             priv_check(td, PRIV_MAXFILES) != 0) ||
 2109             openfiles_new >= maxfiles) {
 2110                 atomic_subtract_int(&openfiles, 1);
 2111                 if (ppsratecheck(&lastfail, &curfail, 1)) {
 2112                         printf("kern.maxfiles limit exceeded by uid %i, (%s) "
 2113                             "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
 2114                 }
 2115                 return (ENFILE);
 2116         }
 2117         fp = uma_zalloc(file_zone, M_WAITOK);
 2118         bzero(fp, sizeof(*fp));
 2119         refcount_init(&fp->f_count, n);
 2120         fp->f_cred = crhold(td->td_ucred);
 2121         fp->f_ops = &badfileops;
 2122         *resultfp = fp;
 2123         return (0);
 2124 }
 2125 
 2126 void
 2127 falloc_abort(struct thread *td, struct file *fp)
 2128 {
 2129 
 2130         /*
 2131          * For assertion purposes.
 2132          */
 2133         refcount_init(&fp->f_count, 0);
 2134         _fdrop(fp, td);
 2135 }
 2136 
 2137 /*
 2138  * Install a file in a file descriptor table.
 2139  */
 2140 void
 2141 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
 2142     struct filecaps *fcaps)
 2143 {
 2144         struct filedescent *fde;
 2145 
 2146         MPASS(fp != NULL);
 2147         if (fcaps != NULL)
 2148                 filecaps_validate(fcaps, __func__);
 2149         FILEDESC_XLOCK_ASSERT(fdp);
 2150 
 2151         fde = &fdp->fd_ofiles[fd];
 2152 #ifdef CAPABILITIES
 2153         seqc_write_begin(&fde->fde_seqc);
 2154 #endif
 2155         fde->fde_file = fp;
 2156         fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
 2157         if (fcaps != NULL)
 2158                 filecaps_move(fcaps, &fde->fde_caps);
 2159         else
 2160                 filecaps_fill(&fde->fde_caps);
 2161 #ifdef CAPABILITIES
 2162         seqc_write_end(&fde->fde_seqc);
 2163 #endif
 2164 }
 2165 
 2166 int
 2167 finstall_refed(struct thread *td, struct file *fp, int *fd, int flags,
 2168     struct filecaps *fcaps)
 2169 {
 2170         struct filedesc *fdp = td->td_proc->p_fd;
 2171         int error;
 2172 
 2173         MPASS(fd != NULL);
 2174 
 2175         FILEDESC_XLOCK(fdp);
 2176         error = fdalloc(td, 0, fd);
 2177         if (__predict_true(error == 0)) {
 2178                 _finstall(fdp, fp, *fd, flags, fcaps);
 2179         }
 2180         FILEDESC_XUNLOCK(fdp);
 2181         return (error);
 2182 }
 2183 
 2184 int
 2185 finstall(struct thread *td, struct file *fp, int *fd, int flags,
 2186     struct filecaps *fcaps)
 2187 {
 2188         int error;
 2189 
 2190         MPASS(fd != NULL);
 2191 
 2192         if (!fhold(fp))
 2193                 return (EBADF);
 2194         error = finstall_refed(td, fp, fd, flags, fcaps);
 2195         if (__predict_false(error != 0)) {
 2196                 fdrop(fp, td);
 2197         }
 2198         return (error);
 2199 }
 2200 
 2201 /*
 2202  * Build a new filedesc structure from another.
 2203  *
 2204  * If fdp is not NULL and prepfiles is true, return with it shared locked.
 2205  */
 2206 struct filedesc *
 2207 fdinit(struct filedesc *fdp, bool prepfiles, int *lastfile)
 2208 {
 2209         struct filedesc0 *newfdp0;
 2210         struct filedesc *newfdp;
 2211 
 2212         if (prepfiles)
 2213                 MPASS(lastfile != NULL);
 2214         else
 2215                 MPASS(lastfile == NULL);
 2216 
 2217         newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 2218         newfdp = &newfdp0->fd_fd;
 2219 
 2220         /* Create the file descriptor table. */
 2221         FILEDESC_LOCK_INIT(newfdp);
 2222         refcount_init(&newfdp->fd_refcnt, 1);
 2223         refcount_init(&newfdp->fd_holdcnt, 1);
 2224         newfdp->fd_map = newfdp0->fd_dmap;
 2225         newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 2226         newfdp->fd_files->fdt_nfiles = NDFILE;
 2227 
 2228         if (fdp == NULL)
 2229                 return (newfdp);
 2230 
 2231         FILEDESC_SLOCK(fdp);
 2232         if (!prepfiles) {
 2233                 FILEDESC_SUNLOCK(fdp);
 2234                 return (newfdp);
 2235         }
 2236 
 2237         for (;;) {
 2238                 *lastfile = fdlastfile(fdp);
 2239                 if (*lastfile < newfdp->fd_nfiles)
 2240                         break;
 2241                 FILEDESC_SUNLOCK(fdp);
 2242                 fdgrowtable(newfdp, *lastfile + 1);
 2243                 FILEDESC_SLOCK(fdp);
 2244         }
 2245 
 2246         return (newfdp);
 2247 }
 2248 
 2249 /*
 2250  * Build a pwddesc structure from another.
 2251  * Copy the current, root, and jail root vnode references.
 2252  *
 2253  * If pdp is not NULL and keeplock is set, return with it exclusively locked.
 2254  */
 2255 struct pwddesc *
 2256 pdinit(struct pwddesc *pdp, bool keeplock)
 2257 {
 2258         struct pwddesc *newpdp;
 2259         struct pwd *newpwd;
 2260 
 2261         newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO);
 2262 
 2263         PWDDESC_LOCK_INIT(newpdp);
 2264         refcount_init(&newpdp->pd_refcount, 1);
 2265         newpdp->pd_cmask = CMASK;
 2266 
 2267         if (pdp == NULL) {
 2268                 newpwd = pwd_alloc();
 2269                 smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
 2270                 return (newpdp);
 2271         }
 2272 
 2273         PWDDESC_XLOCK(pdp);
 2274         newpwd = pwd_hold_pwddesc(pdp);
 2275         smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
 2276         if (!keeplock)
 2277                 PWDDESC_XUNLOCK(pdp);
 2278         return (newpdp);
 2279 }
 2280 
 2281 /*
 2282  * Hold either filedesc or pwddesc of the passed process.
 2283  *
 2284  * The process lock is used to synchronize against the target exiting and
 2285  * freeing the data.
 2286  *
 2287  * Clearing can be illustrated in 3 steps:
 2288  * 1. set the pointer to NULL.  Either routine can race against it, hence
 2289  *   atomic_load_ptr.
 2290  * 2. observe the process lock as not taken.  Until then fdhold/pdhold can
 2291  *   race to either still see the pointer or find NULL.  It is still safe to
 2292  *   grab a reference as clearing is stalled.
 2293  * 3. after the lock is observed as not taken, any fdhold/pdhold calls are
 2294  *   guaranteed to see NULL, making it safe to finish clearing.
 2295  */
 2296 static struct filedesc *
 2297 fdhold(struct proc *p)
 2298 {
 2299         struct filedesc *fdp;
 2300 
 2301         PROC_LOCK_ASSERT(p, MA_OWNED);
 2302         fdp = atomic_load_ptr(&p->p_fd);
 2303         if (fdp != NULL)
 2304                 refcount_acquire(&fdp->fd_holdcnt);
 2305         return (fdp);
 2306 }
 2307 
 2308 static struct pwddesc *
 2309 pdhold(struct proc *p)
 2310 {
 2311         struct pwddesc *pdp;
 2312 
 2313         PROC_LOCK_ASSERT(p, MA_OWNED);
 2314         pdp = atomic_load_ptr(&p->p_pd);
 2315         if (pdp != NULL)
 2316                 refcount_acquire(&pdp->pd_refcount);
 2317         return (pdp);
 2318 }
 2319 
 2320 static void
 2321 fddrop(struct filedesc *fdp)
 2322 {
 2323 
 2324         if (refcount_load(&fdp->fd_holdcnt) > 1) {
 2325                 if (refcount_release(&fdp->fd_holdcnt) == 0)
 2326                         return;
 2327         }
 2328 
 2329         FILEDESC_LOCK_DESTROY(fdp);
 2330         uma_zfree(filedesc0_zone, fdp);
 2331 }
 2332 
 2333 static void
 2334 pddrop(struct pwddesc *pdp)
 2335 {
 2336         struct pwd *pwd;
 2337 
 2338         if (refcount_release_if_not_last(&pdp->pd_refcount))
 2339                 return;
 2340 
 2341         PWDDESC_XLOCK(pdp);
 2342         if (refcount_release(&pdp->pd_refcount) == 0) {
 2343                 PWDDESC_XUNLOCK(pdp);
 2344                 return;
 2345         }
 2346         pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 2347         pwd_set(pdp, NULL);
 2348         PWDDESC_XUNLOCK(pdp);
 2349         pwd_drop(pwd);
 2350 
 2351         PWDDESC_LOCK_DESTROY(pdp);
 2352         free(pdp, M_PWDDESC);
 2353 }
 2354 
 2355 /*
 2356  * Share a filedesc structure.
 2357  */
 2358 struct filedesc *
 2359 fdshare(struct filedesc *fdp)
 2360 {
 2361 
 2362         refcount_acquire(&fdp->fd_refcnt);
 2363         return (fdp);
 2364 }
 2365 
 2366 /*
 2367  * Share a pwddesc structure.
 2368  */
 2369 struct pwddesc *
 2370 pdshare(struct pwddesc *pdp)
 2371 {
 2372         refcount_acquire(&pdp->pd_refcount);
 2373         return (pdp);
 2374 }
 2375 
 2376 /*
 2377  * Unshare a filedesc structure, if necessary by making a copy
 2378  */
 2379 void
 2380 fdunshare(struct thread *td)
 2381 {
 2382         struct filedesc *tmp;
 2383         struct proc *p = td->td_proc;
 2384 
 2385         if (refcount_load(&p->p_fd->fd_refcnt) == 1)
 2386                 return;
 2387 
 2388         tmp = fdcopy(p->p_fd);
 2389         fdescfree(td);
 2390         p->p_fd = tmp;
 2391 }
 2392 
 2393 /*
 2394  * Unshare a pwddesc structure.
 2395  */
 2396 void
 2397 pdunshare(struct thread *td)
 2398 {
 2399         struct pwddesc *pdp;
 2400         struct proc *p;
 2401 
 2402         p = td->td_proc;
 2403         /* Not shared. */
 2404         if (p->p_pd->pd_refcount == 1)
 2405                 return;
 2406 
 2407         pdp = pdcopy(p->p_pd);
 2408         pdescfree(td);
 2409         p->p_pd = pdp;
 2410 }
 2411 
 2412 void
 2413 fdinstall_remapped(struct thread *td, struct filedesc *fdp)
 2414 {
 2415 
 2416         fdescfree(td);
 2417         td->td_proc->p_fd = fdp;
 2418 }
 2419 
 2420 /*
 2421  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference;
 2422  * this is to ease callers, not to catch errors.
 2423  */
 2424 struct filedesc *
 2425 fdcopy(struct filedesc *fdp)
 2426 {
 2427         struct filedesc *newfdp;
 2428         struct filedescent *nfde, *ofde;
 2429         int i, lastfile;
 2430 
 2431         MPASS(fdp != NULL);
 2432 
 2433         newfdp = fdinit(fdp, true, &lastfile);
 2434         /* copy all passable descriptors (i.e. not kqueue) */
 2435         newfdp->fd_freefile = -1;
 2436         for (i = 0; i <= lastfile; ++i) {
 2437                 ofde = &fdp->fd_ofiles[i];
 2438                 if (ofde->fde_file == NULL ||
 2439                     (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
 2440                     !fhold(ofde->fde_file)) {
 2441                         if (newfdp->fd_freefile == -1)
 2442                                 newfdp->fd_freefile = i;
 2443                         continue;
 2444                 }
 2445                 nfde = &newfdp->fd_ofiles[i];
 2446                 *nfde = *ofde;
 2447                 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 2448                 fdused_init(newfdp, i);
 2449         }
 2450         if (newfdp->fd_freefile == -1)
 2451                 newfdp->fd_freefile = i;
 2452         FILEDESC_SUNLOCK(fdp);
 2453         return (newfdp);
 2454 }
 2455 
 2456 /*
 2457  * Copy a pwddesc structure.
 2458  */
 2459 struct pwddesc *
 2460 pdcopy(struct pwddesc *pdp)
 2461 {
 2462         struct pwddesc *newpdp;
 2463 
 2464         MPASS(pdp != NULL);
 2465 
 2466         newpdp = pdinit(pdp, true);
 2467         newpdp->pd_cmask = pdp->pd_cmask;
 2468         PWDDESC_XUNLOCK(pdp);
 2469         return (newpdp);
 2470 }
 2471 
 2472 /*
 2473  * Copies a filedesc structure, while remapping all file descriptors
 2474  * stored inside using a translation table.
 2475  *
 2476  * File descriptors are copied over to the new file descriptor table,
 2477  * regardless of whether the close-on-exec flag is set.
 2478  */
 2479 int
 2480 fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
 2481     struct filedesc **ret)
 2482 {
 2483         struct filedesc *newfdp;
 2484         struct filedescent *nfde, *ofde;
 2485         int error, i, lastfile;
 2486 
 2487         MPASS(fdp != NULL);
 2488 
 2489         newfdp = fdinit(fdp, true, &lastfile);
 2490         if (nfds > lastfile + 1) {
 2491                 /* New table cannot be larger than the old one. */
 2492                 error = E2BIG;
 2493                 goto bad;
 2494         }
 2495         /* Copy all passable descriptors (i.e. not kqueue). */
 2496         newfdp->fd_freefile = nfds;
 2497         for (i = 0; i < nfds; ++i) {
 2498                 if (fds[i] < 0 || fds[i] > lastfile) {
 2499                         /* File descriptor out of bounds. */
 2500                         error = EBADF;
 2501                         goto bad;
 2502                 }
 2503                 ofde = &fdp->fd_ofiles[fds[i]];
 2504                 if (ofde->fde_file == NULL) {
 2505                         /* Unused file descriptor. */
 2506                         error = EBADF;
 2507                         goto bad;
 2508                 }
 2509                 if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 2510                         /* File descriptor cannot be passed. */
 2511                         error = EINVAL;
 2512                         goto bad;
 2513                 }
 2514                 if (!fhold(ofde->fde_file)) {
 2515                         error = EBADF;
 2516                         goto bad;
 2517                 }
 2518                 nfde = &newfdp->fd_ofiles[i];
 2519                 *nfde = *ofde;
 2520                 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 2521                 fdused_init(newfdp, i);
 2522         }
 2523         FILEDESC_SUNLOCK(fdp);
 2524         *ret = newfdp;
 2525         return (0);
 2526 bad:
 2527         FILEDESC_SUNLOCK(fdp);
 2528         fdescfree_remapped(newfdp);
 2529         return (error);
 2530 }
 2531 
 2532 /*
 2533  * Clear POSIX-style locks.  This is only used when fdp loses a reference (i.e.
 2534  * one of the processes using it exits) and the table used to be shared.
 2535  */
 2536 static void
 2537 fdclearlocks(struct thread *td)
 2538 {
 2539         struct filedesc *fdp;
 2540         struct filedesc_to_leader *fdtol;
 2541         struct flock lf;
 2542         struct file *fp;
 2543         struct proc *p;
 2544         struct vnode *vp;
 2545         int i, lastfile;
 2546 
 2547         p = td->td_proc;
 2548         fdp = p->p_fd;
 2549         fdtol = p->p_fdtol;
 2550         MPASS(fdtol != NULL);
 2551 
 2552         FILEDESC_XLOCK(fdp);
 2553         KASSERT(fdtol->fdl_refcount > 0,
 2554             ("filedesc_to_refcount botch: fdl_refcount=%d",
 2555             fdtol->fdl_refcount));
 2556         if (fdtol->fdl_refcount == 1 &&
 2557             (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 2558                 lastfile = fdlastfile(fdp);
 2559                 for (i = 0; i <= lastfile; i++) {
 2560                         fp = fdp->fd_ofiles[i].fde_file;
 2561                         if (fp == NULL || fp->f_type != DTYPE_VNODE ||
 2562                             !fhold(fp))
 2563                                 continue;
 2564                         FILEDESC_XUNLOCK(fdp);
 2565                         lf.l_whence = SEEK_SET;
 2566                         lf.l_start = 0;
 2567                         lf.l_len = 0;
 2568                         lf.l_type = F_UNLCK;
 2569                         vp = fp->f_vnode;
 2570                         (void) VOP_ADVLOCK(vp,
 2571                             (caddr_t)p->p_leader, F_UNLCK,
 2572                             &lf, F_POSIX);
 2573                         FILEDESC_XLOCK(fdp);
 2574                         fdrop(fp, td);
 2575                 }
 2576         }
 2577 retry:
 2578         if (fdtol->fdl_refcount == 1) {
 2579                 if (fdp->fd_holdleaderscount > 0 &&
 2580                     (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 2581                         /*
 2582                          * close() or kern_dup() has cleared a reference
 2583                          * in a shared file descriptor table.
 2584                          */
 2585                         fdp->fd_holdleaderswakeup = 1;
 2586                         sx_sleep(&fdp->fd_holdleaderscount,
 2587                             FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 2588                         goto retry;
 2589                 }
 2590                 if (fdtol->fdl_holdcount > 0) {
 2591                         /*
 2592                          * Ensure that fdtol->fdl_leader remains
 2593                          * valid in closef().
 2594                          */
 2595                         fdtol->fdl_wakeup = 1;
 2596                         sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 2597                             "fdlhold", 0);
 2598                         goto retry;
 2599                 }
 2600         }
 2601         fdtol->fdl_refcount--;
 2602         if (fdtol->fdl_refcount == 0 &&
 2603             fdtol->fdl_holdcount == 0) {
 2604                 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 2605                 fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 2606         } else
 2607                 fdtol = NULL;
 2608         p->p_fdtol = NULL;
 2609         FILEDESC_XUNLOCK(fdp);
 2610         if (fdtol != NULL)
 2611                 free(fdtol, M_FILEDESC_TO_LEADER);
 2612 }
 2613 
 2614 /*
 2615  * Release a filedesc structure.
 2616  */
 2617 static void
 2618 fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
 2619 {
 2620         struct filedesc0 *fdp0;
 2621         struct freetable *ft, *tft;
 2622         struct filedescent *fde;
 2623         struct file *fp;
 2624         int i, lastfile;
 2625 
 2626         KASSERT(refcount_load(&fdp->fd_refcnt) == 0,
 2627             ("%s: fd table %p carries references", __func__, fdp));
 2628 
 2629         /*
 2630          * Serialize with threads iterating over the table, if any.
 2631          */
 2632         if (refcount_load(&fdp->fd_holdcnt) > 1) {
 2633                 FILEDESC_XLOCK(fdp);
 2634                 FILEDESC_XUNLOCK(fdp);
 2635         }
 2636 
 2637         lastfile = fdlastfile_single(fdp);
 2638         for (i = 0; i <= lastfile; i++) {
 2639                 fde = &fdp->fd_ofiles[i];
 2640                 fp = fde->fde_file;
 2641                 if (fp != NULL) {
 2642                         fdefree_last(fde);
 2643                         if (needclose)
 2644                                 (void) closef(fp, td);
 2645                         else
 2646                                 fdrop(fp, td);
 2647                 }
 2648         }
 2649 
 2650         if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 2651                 free(fdp->fd_map, M_FILEDESC);
 2652         if (fdp->fd_nfiles > NDFILE)
 2653                 free(fdp->fd_files, M_FILEDESC);
 2654 
 2655         fdp0 = (struct filedesc0 *)fdp;
 2656         SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 2657                 free(ft->ft_table, M_FILEDESC);
 2658 
 2659         fddrop(fdp);
 2660 }
 2661 
 2662 void
 2663 fdescfree(struct thread *td)
 2664 {
 2665         struct proc *p;
 2666         struct filedesc *fdp;
 2667 
 2668         p = td->td_proc;
 2669         fdp = p->p_fd;
 2670         MPASS(fdp != NULL);
 2671 
 2672 #ifdef RACCT
 2673         if (RACCT_ENABLED())
 2674                 racct_set_unlocked(p, RACCT_NOFILE, 0);
 2675 #endif
 2676 
 2677         if (p->p_fdtol != NULL)
 2678                 fdclearlocks(td);
 2679 
 2680         /*
 2681          * Check fdhold for an explanation.
 2682          */
 2683         atomic_store_ptr(&p->p_fd, NULL);
 2684         atomic_thread_fence_seq_cst();
 2685         PROC_WAIT_UNLOCKED(p);
 2686 
 2687         if (refcount_release(&fdp->fd_refcnt) == 0)
 2688                 return;
 2689 
 2690         fdescfree_fds(td, fdp, 1);
 2691 }
 2692 
 2693 void
 2694 pdescfree(struct thread *td)
 2695 {
 2696         struct proc *p;
 2697         struct pwddesc *pdp;
 2698 
 2699         p = td->td_proc;
 2700         pdp = p->p_pd;
 2701         MPASS(pdp != NULL);
 2702 
 2703         /*
 2704          * Check pdhold for an explanation.
 2705          */
 2706         atomic_store_ptr(&p->p_pd, NULL);
 2707         atomic_thread_fence_seq_cst();
 2708         PROC_WAIT_UNLOCKED(p);
 2709 
 2710         pddrop(pdp);
 2711 }
 2712 
 2713 void
 2714 fdescfree_remapped(struct filedesc *fdp)
 2715 {
 2716 #ifdef INVARIANTS
 2717         /* fdescfree_fds() asserts that fd_refcnt == 0. */
 2718         if (!refcount_release(&fdp->fd_refcnt))
 2719                 panic("%s: fd table %p has extra references", __func__, fdp);
 2720 #endif
 2721         fdescfree_fds(curthread, fdp, 0);
 2722 }
 2723 
 2724 /*
 2725  * For setugid programs, we don't want people to use that setugidness
 2726  * to generate error messages which write to a file which would
 2727  * otherwise be off-limits to the process.  We check for filesystems where
 2728  * the vnode can change out from under us after execve (like [lin]procfs).
 2729  *
 2730  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
 2731  * sufficient.  We also don't check for setugidness since we know we are.
 2732  */
 2733 static bool
 2734 is_unsafe(struct file *fp)
 2735 {
 2736         struct vnode *vp;
 2737 
 2738         if (fp->f_type != DTYPE_VNODE)
 2739                 return (false);
 2740 
 2741         vp = fp->f_vnode;
 2742         return ((vp->v_vflag & VV_PROCDEP) != 0);
 2743 }
 2744 
 2745 /*
 2746  * Make this setugid thing safe, if at all possible.
 2747  */
 2748 void
 2749 fdsetugidsafety(struct thread *td)
 2750 {
 2751         struct filedesc *fdp;
 2752         struct file *fp;
 2753         int i;
 2754 
 2755         fdp = td->td_proc->p_fd;
 2756         KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 2757             ("the fdtable should not be shared"));
 2758         MPASS(fdp->fd_nfiles >= 3);
 2759         for (i = 0; i <= 2; i++) {
 2760                 fp = fdp->fd_ofiles[i].fde_file;
 2761                 if (fp != NULL && is_unsafe(fp)) {
 2762                         FILEDESC_XLOCK(fdp);
 2763                         knote_fdclose(td, i);
 2764                         /*
 2765                          * NULL-out descriptor prior to close to avoid
 2766                          * a race while close blocks.
 2767                          */
 2768                         fdfree(fdp, i);
 2769                         FILEDESC_XUNLOCK(fdp);
 2770                         (void) closef(fp, td);
 2771                 }
 2772         }
 2773 }
 2774 
 2775 /*
 2776  * If a specific file object occupies a specific file descriptor, close the
 2777  * file descriptor entry and drop a reference on the file object.  This is a
 2778  * convenience function for handling a subsequent error in a function that
 2779  * calls falloc(); it copes with the race in which another thread might have
 2780  * closed the descriptor out from under the thread creating the file object.
 2781  */
 2782 void
 2783 fdclose(struct thread *td, struct file *fp, int idx)
 2784 {
 2785         struct filedesc *fdp = td->td_proc->p_fd;
 2786 
 2787         FILEDESC_XLOCK(fdp);
 2788         if (fdp->fd_ofiles[idx].fde_file == fp) {
 2789                 fdfree(fdp, idx);
 2790                 FILEDESC_XUNLOCK(fdp);
 2791                 fdrop(fp, td);
 2792         } else
 2793                 FILEDESC_XUNLOCK(fdp);
 2794 }
 2795 
 2796 /*
 2797  * Close any files marked close-on-exec (or of mqueue type) during exec.
 2798  */
 2799 void
 2800 fdcloseexec(struct thread *td)
 2801 {
 2802         struct filedesc *fdp;
 2803         struct filedescent *fde;
 2804         struct file *fp;
 2805         int i, lastfile;
 2806 
 2807         fdp = td->td_proc->p_fd;
 2808         KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 2809             ("the fdtable should not be shared"));
 2810         lastfile = fdlastfile_single(fdp);
 2811         for (i = 0; i <= lastfile; i++) {
 2812                 fde = &fdp->fd_ofiles[i];
 2813                 fp = fde->fde_file;
 2814                 if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 2815                     (fde->fde_flags & UF_EXCLOSE))) {
 2816                         FILEDESC_XLOCK(fdp);
 2817                         fdfree(fdp, i);
 2818                         (void) closefp(fdp, i, fp, td, false, false);
 2819                         FILEDESC_UNLOCK_ASSERT(fdp);
 2820                 }
 2821         }
 2822 }
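
/*
 * Illustrative userspace sketch: the UF_EXCLOSE flag honoured by fdcloseexec()
 * is set either at open time with O_CLOEXEC or afterwards with fcntl(F_SETFD).
 * The helper name is made up for the example.
 */
#include <fcntl.h>
#include <stdio.h>

static int
example_open_cloexec(const char *path)
{
	int fd, flags;

	fd = open(path, O_RDONLY | O_CLOEXEC);
	if (fd == -1) {
		perror("open");
		return (-1);
	}
	/* The equivalent after the fact: */
	flags = fcntl(fd, F_GETFD);
	if (flags != -1)
		(void)fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
	return (fd);
}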
 2823 
 2824 /*
 2825  * It is unsafe for set[ug]id processes to be started with file
 2826  * descriptors 0..2 closed, as these descriptors are given implicit
 2827  * significance in the Standard C library.  fdcheckstd() will create a
 2828  * descriptor referencing /dev/null for each of stdin, stdout, and
 2829  * stderr that is not already open.
 2830  */
 2831 int
 2832 fdcheckstd(struct thread *td)
 2833 {
 2834         struct filedesc *fdp;
 2835         register_t save;
 2836         int i, error, devnull;
 2837 
 2838         fdp = td->td_proc->p_fd;
 2839         KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 2840             ("the fdtable should not be shared"));
 2841         MPASS(fdp->fd_nfiles >= 3);
 2842         devnull = -1;
 2843         for (i = 0; i <= 2; i++) {
 2844                 if (fdp->fd_ofiles[i].fde_file != NULL)
 2845                         continue;
 2846 
 2847                 save = td->td_retval[0];
 2848                 if (devnull != -1) {
 2849                         error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
 2850                 } else {
 2851                         error = kern_openat(td, AT_FDCWD, "/dev/null",
 2852                             UIO_SYSSPACE, O_RDWR, 0);
 2853                         if (error == 0) {
 2854                                 devnull = td->td_retval[0];
 2855                                 KASSERT(devnull == i, ("we didn't get our fd"));
 2856                         }
 2857                 }
 2858                 td->td_retval[0] = save;
 2859                 if (error != 0)
 2860                         return (error);
 2861         }
 2862         return (0);
 2863 }
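
/*
 * Illustrative userspace sketch: daemons do by hand what fdcheckstd() does for
 * set[ug]id programs, pointing any closed std descriptor at /dev/null.  Each
 * open() fills the lowest free descriptor, so looping until the result is
 * above 2 guarantees 0, 1 and 2 are all open.  The helper name is made up.
 */
#include <fcntl.h>
#include <unistd.h>

static void
example_fill_std_fds(void)
{
	int fd;

	do {
		fd = open("/dev/null", O_RDWR);
	} while (fd >= 0 && fd <= 2);
	if (fd > 2)
		close(fd);
}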
 2864 
 2865 /*
 2866  * Internal form of close.  Decrement reference count on file structure.
 2867  * Note: td must not be NULL here; closef_nothread() below handles closing
 2868  * a file that was being passed in a message with no owning thread.
 2869  */
 2870 int
 2871 closef(struct file *fp, struct thread *td)
 2872 {
 2873         struct vnode *vp;
 2874         struct flock lf;
 2875         struct filedesc_to_leader *fdtol;
 2876         struct filedesc *fdp;
 2877 
 2878         MPASS(td != NULL);
 2879 
 2880         /*
 2881          * POSIX record locking dictates that any close releases ALL
 2882          * locks owned by this process.  This is handled by setting
 2883          * a flag in the unlock to free ONLY locks obeying POSIX
 2884          * semantics, and not to free BSD-style file locks.
 2885          * If the descriptor was in a message, POSIX-style locks
 2886          * aren't passed with the descriptor; such files are
 2887          * closed through closef_nothread() instead, since there
 2888          * is no owning context whose locks would need to be
 2889          * released, and the lock-clearing work below would have
 2890          * nothing to do.
 2891          */
 2892         if (fp->f_type == DTYPE_VNODE) {
 2893                 vp = fp->f_vnode;
 2894                 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 2895                         lf.l_whence = SEEK_SET;
 2896                         lf.l_start = 0;
 2897                         lf.l_len = 0;
 2898                         lf.l_type = F_UNLCK;
 2899                         (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 2900                             F_UNLCK, &lf, F_POSIX);
 2901                 }
 2902                 fdtol = td->td_proc->p_fdtol;
 2903                 if (fdtol != NULL) {
 2904                         /*
 2905                          * Handle special case where file descriptor table is
 2906                          * shared between multiple process leaders.
 2907                          */
 2908                         fdp = td->td_proc->p_fd;
 2909                         FILEDESC_XLOCK(fdp);
 2910                         for (fdtol = fdtol->fdl_next;
 2911                             fdtol != td->td_proc->p_fdtol;
 2912                             fdtol = fdtol->fdl_next) {
 2913                                 if ((fdtol->fdl_leader->p_flag &
 2914                                     P_ADVLOCK) == 0)
 2915                                         continue;
 2916                                 fdtol->fdl_holdcount++;
 2917                                 FILEDESC_XUNLOCK(fdp);
 2918                                 lf.l_whence = SEEK_SET;
 2919                                 lf.l_start = 0;
 2920                                 lf.l_len = 0;
 2921                                 lf.l_type = F_UNLCK;
 2922                                 vp = fp->f_vnode;
 2923                                 (void) VOP_ADVLOCK(vp,
 2924                                     (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 2925                                     F_POSIX);
 2926                                 FILEDESC_XLOCK(fdp);
 2927                                 fdtol->fdl_holdcount--;
 2928                                 if (fdtol->fdl_holdcount == 0 &&
 2929                                     fdtol->fdl_wakeup != 0) {
 2930                                         fdtol->fdl_wakeup = 0;
 2931                                         wakeup(fdtol);
 2932                                 }
 2933                         }
 2934                         FILEDESC_XUNLOCK(fdp);
 2935                 }
 2936         }
 2937         return (fdrop_close(fp, td));
 2938 }
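
/*
 * Illustrative userspace sketch of the POSIX-lock rule closef() implements:
 * closing *any* descriptor for a file drops all of the process's fcntl()
 * locks on that file, even ones taken through a different descriptor.
 * The helper name is made up; error handling is abbreviated.
 */
#include <fcntl.h>
#include <unistd.h>

static void
example_lock_dropped_on_close(const char *path)
{
	struct flock fl = {
		.l_type = F_WRLCK,
		.l_whence = SEEK_SET,
		.l_start = 0,
		.l_len = 0,		/* whole file */
	};
	int fd1, fd2;

	fd1 = open(path, O_RDWR);
	if (fd1 == -1)
		return;
	fd2 = open(path, O_RDWR);
	if (fd2 == -1) {
		close(fd1);
		return;
	}
	(void)fcntl(fd1, F_SETLK, &fl);	/* lock taken through fd1... */
	close(fd2);			/* ...but released here, through fd2 */
	close(fd1);
}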
 2939 
 2940 /*
 2941  * Hack for file descriptor passing code.
 2942  */
 2943 void
 2944 closef_nothread(struct file *fp)
 2945 {
 2946 
 2947         fdrop(fp, NULL);
 2948 }
 2949 
 2950 /*
 2951  * Initialize the file pointer with the specified properties.
 2952  *
 2953  * The ops are set with release semantics to be certain that the flags, type,
 2954  * and data are visible when ops is.  This is to prevent ops methods from being
 2955  * called with bad data.
 2956  */
 2957 void
 2958 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 2959 {
 2960         fp->f_data = data;
 2961         fp->f_flag = flag;
 2962         fp->f_type = type;
 2963         atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 2964 }
 2965 
 2966 void
 2967 finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops)
 2968 {
 2969         fp->f_seqcount[UIO_READ] = 1;
 2970         fp->f_seqcount[UIO_WRITE] = 1;
 2971         finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
 2972             data, ops);
 2973 }
 2974 
 2975 int
 2976 fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 2977     struct file **fpp, struct filecaps *havecapsp)
 2978 {
 2979         struct filedescent *fde;
 2980         int error;
 2981 
 2982         FILEDESC_LOCK_ASSERT(fdp);
 2983 
 2984         *fpp = NULL;
 2985         fde = fdeget_locked(fdp, fd);
 2986         if (fde == NULL) {
 2987                 error = EBADF;
 2988                 goto out;
 2989         }
 2990 
 2991 #ifdef CAPABILITIES
 2992         error = cap_check(cap_rights_fde_inline(fde), needrightsp);
 2993         if (error != 0)
 2994                 goto out;
 2995 #endif
 2996 
 2997         if (havecapsp != NULL)
 2998                 filecaps_copy(&fde->fde_caps, havecapsp, true);
 2999 
 3000         *fpp = fde->fde_file;
 3001 
 3002         error = 0;
 3003 out:
 3004         return (error);
 3005 }
 3006 
 3007 int
 3008 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
 3009     struct file **fpp, struct filecaps *havecapsp)
 3010 {
 3011         struct filedesc *fdp = td->td_proc->p_fd;
 3012         int error;
 3013 #ifndef CAPABILITIES
 3014         error = fget_unlocked(fdp, fd, needrightsp, fpp);
 3015         if (havecapsp != NULL && error == 0)
 3016                 filecaps_fill(havecapsp);
 3017 #else
 3018         struct file *fp;
 3019         seqc_t seq;
 3020 
 3021         *fpp = NULL;
 3022         for (;;) {
 3023                 error = fget_unlocked_seq(fdp, fd, needrightsp, &fp, &seq);
 3024                 if (error != 0)
 3025                         return (error);
 3026 
 3027                 if (havecapsp != NULL) {
 3028                         if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
 3029                             havecapsp, false)) {
 3030                                 fdrop(fp, td);
 3031                                 goto get_locked;
 3032                         }
 3033                 }
 3034 
 3035                 if (!fd_modified(fdp, fd, seq))
 3036                         break;
 3037                 fdrop(fp, td);
 3038         }
 3039 
 3040         *fpp = fp;
 3041         return (0);
 3042 
 3043 get_locked:
 3044         FILEDESC_SLOCK(fdp);
 3045         error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
 3046         if (error == 0 && !fhold(*fpp))
 3047                 error = EBADF;
 3048         FILEDESC_SUNLOCK(fdp);
 3049 #endif
 3050         return (error);
 3051 }
 3052 
 3053 #ifdef CAPABILITIES
 3054 int
 3055 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
 3056 {
 3057         const struct filedescent *fde;
 3058         const struct fdescenttbl *fdt;
 3059         struct filedesc *fdp;
 3060         struct file *fp;
 3061         struct vnode *vp;
 3062         const cap_rights_t *haverights;
 3063         cap_rights_t rights;
 3064         seqc_t seq;
 3065 
 3066         VFS_SMR_ASSERT_ENTERED();
 3067 
 3068         rights = *ndp->ni_rightsneeded;
 3069         cap_rights_set_one(&rights, CAP_LOOKUP);
 3070 
 3071         fdp = curproc->p_fd;
 3072         fdt = fdp->fd_files;
 3073         if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 3074                 return (EBADF);
 3075         seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 3076         fde = &fdt->fdt_ofiles[fd];
 3077         haverights = cap_rights_fde_inline(fde);
 3078         fp = fde->fde_file;
 3079         if (__predict_false(fp == NULL))
 3080                 return (EAGAIN);
 3081         if (__predict_false(cap_check_inline_transient(haverights, &rights)))
 3082                 return (EAGAIN);
 3083         *fsearch = ((fp->f_flag & FSEARCH) != 0);
 3084         vp = fp->f_vnode;
 3085         if (__predict_false(vp == NULL)) {
 3086                 return (EAGAIN);
 3087         }
 3088         if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) {
 3089                 return (EAGAIN);
 3090         }
 3091         /*
 3092          * Use an acquire barrier to force re-reading of fdt so it is
 3093          * refreshed for verification.
 3094          */
 3095         atomic_thread_fence_acq();
 3096         fdt = fdp->fd_files;
 3097         if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
 3098                 return (EAGAIN);
 3099         /*
 3100          * If the file descriptor doesn't have all rights,
 3101          * all lookups relative to it must also be
 3102          * strictly relative.
 3103          *
 3104          * Not yet supported by the fast path.
 3105          */
 3106         CAP_ALL(&rights);
 3107         if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
 3108             ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 3109             ndp->ni_filecaps.fc_nioctls != -1) {
 3110 #ifdef notyet
 3111                 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 3112 #else
 3113                 return (EAGAIN);
 3114 #endif
 3115         }
 3116         *vpp = vp;
 3117         return (0);
 3118 }
 3119 #else
 3120 int
 3121 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
 3122 {
 3123         const struct fdescenttbl *fdt;
 3124         struct filedesc *fdp;
 3125         struct file *fp;
 3126         struct vnode *vp;
 3127 
 3128         VFS_SMR_ASSERT_ENTERED();
 3129 
 3130         fdp = curproc->p_fd;
 3131         fdt = fdp->fd_files;
 3132         if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 3133                 return (EBADF);
 3134         fp = fdt->fdt_ofiles[fd].fde_file;
 3135         if (__predict_false(fp == NULL))
 3136                 return (EAGAIN);
 3137         *fsearch = ((fp->f_flag & FSEARCH) != 0);
 3138         vp = fp->f_vnode;
 3139         if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
 3140                 return (EAGAIN);
 3141         }
 3142         /*
 3143          * Use an acquire barrier to force re-reading of fdt so it is
 3144          * refreshed for verification.
 3145          */
 3146         atomic_thread_fence_acq();
 3147         fdt = fdp->fd_files;
 3148         if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
 3149                 return (EAGAIN);
 3150         filecaps_fill(&ndp->ni_filecaps);
 3151         *vpp = vp;
 3152         return (0);
 3153 }
 3154 #endif
 3155 
 3156 int
 3157 fgetvp_lookup(int fd, struct nameidata *ndp, struct vnode **vpp)
 3158 {
 3159         struct thread *td;
 3160         struct file *fp;
 3161         struct vnode *vp;
 3162         struct componentname *cnp;
 3163         cap_rights_t rights;
 3164         int error;
 3165 
 3166         td = curthread;
 3167         rights = *ndp->ni_rightsneeded;
 3168         cap_rights_set_one(&rights, CAP_LOOKUP);
 3169         cnp = &ndp->ni_cnd;
 3170 
 3171         error = fget_cap(td, ndp->ni_dirfd, &rights, &fp, &ndp->ni_filecaps);
 3172         if (__predict_false(error != 0))
 3173                 return (error);
 3174         if (__predict_false(fp->f_ops == &badfileops)) {
 3175                 error = EBADF;
 3176                 goto out_free;
 3177         }
 3178         vp = fp->f_vnode;
 3179         if (__predict_false(vp == NULL)) {
 3180                 error = ENOTDIR;
 3181                 goto out_free;
 3182         }
 3183         vrefact(vp);
 3184         /*
 3185          * XXX does not check for VDIR, handled by namei_setup
 3186          */
 3187         if ((fp->f_flag & FSEARCH) != 0)
 3188                 cnp->cn_flags |= NOEXECCHECK;
 3189         fdrop(fp, td);
 3190 
 3191 #ifdef CAPABILITIES
 3192         /*
 3193          * If the file descriptor doesn't have all rights,
 3194          * all lookups relative to it must also be
 3195          * strictly relative.
 3196          */
 3197         CAP_ALL(&rights);
 3198         if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
 3199             ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 3200             ndp->ni_filecaps.fc_nioctls != -1) {
 3201                 ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 3202                 ndp->ni_resflags |= NIRES_STRICTREL;
 3203         }
 3204 #endif
 3205 
 3206         /*
 3207          * TODO: avoid copying ioctl caps if it can be helped to begin with
 3208          */
 3209         if ((cnp->cn_flags & WANTIOCTLCAPS) == 0)
 3210                 filecaps_free_ioctl(&ndp->ni_filecaps);
 3211 
 3212         *vpp = vp;
 3213         return (0);
 3214 
 3215 out_free:
 3216         filecaps_free(&ndp->ni_filecaps);
 3217         fdrop(fp, td);
 3218         return (error);
 3219 }
 3220 
 3221 static int
 3222 fget_unlocked_seq(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 3223     struct file **fpp, seqc_t *seqp)
 3224 {
 3225 #ifdef CAPABILITIES
 3226         const struct filedescent *fde;
 3227 #endif
 3228         const struct fdescenttbl *fdt;
 3229         struct file *fp;
 3230 #ifdef CAPABILITIES
 3231         seqc_t seq;
 3232         cap_rights_t haverights;
 3233         int error;
 3234 #endif
 3235 
 3236         fdt = fdp->fd_files;
 3237         if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 3238                 return (EBADF);
 3239         /*
 3240          * Fetch the descriptor locklessly.  We avoid fdrop() races by
 3241          * never raising a refcount back up from 0.  To accomplish this we have
 3242          * to use a cmpset loop rather than an atomic_add.  The descriptor
 3243          * must be re-verified once we acquire a reference to be certain
 3244          * that the identity is still correct and we did not lose a race
 3245          * due to preemption.
 3246          */
 3247         for (;;) {
 3248 #ifdef CAPABILITIES
 3249                 seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 3250                 fde = &fdt->fdt_ofiles[fd];
 3251                 haverights = *cap_rights_fde_inline(fde);
 3252                 fp = fde->fde_file;
 3253                 if (!seqc_consistent(fd_seqc(fdt, fd), seq))
 3254                         continue;
 3255 #else
 3256                 fp = fdt->fdt_ofiles[fd].fde_file;
 3257 #endif
 3258                 if (fp == NULL)
 3259                         return (EBADF);
 3260 #ifdef CAPABILITIES
 3261                 error = cap_check_inline(&haverights, needrightsp);
 3262                 if (error != 0)
 3263                         return (error);
 3264 #endif
 3265                 if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
 3266                         /*
 3267                          * Force a reload.  Another thread could have
 3268                          * reallocated the table before this fd was closed,
 3269                          * so the cached version may hold a stale fp pointer.
 3270                          */
 3271                         fdt = atomic_load_ptr(&fdp->fd_files);
 3272                         continue;
 3273                 }
 3274                 /*
 3275                  * Use an acquire barrier to force re-reading of fdt so it is
 3276                  * refreshed for verification.
 3277                  */
 3278                 atomic_thread_fence_acq();
 3279                 fdt = fdp->fd_files;
 3280 #ifdef  CAPABILITIES
 3281                 if (seqc_consistent_nomb(fd_seqc(fdt, fd), seq))
 3282 #else
 3283                 if (fp == fdt->fdt_ofiles[fd].fde_file)
 3284 #endif
 3285                         break;
 3286                 fdrop(fp, curthread);
 3287         }
 3288         *fpp = fp;
 3289         if (seqp != NULL) {
 3290 #ifdef CAPABILITIES
 3291                 *seqp = seq;
 3292 #endif
 3293         }
 3294         return (0);
 3295 }
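
/*
 * Illustrative sketch (not part of kern_descrip.c): the sequence-counter
 * validation used above can be modelled with plain C11 atomics.  This is a
 * simplified, single-writer userspace sketch -- seq_slot_read/seq_slot_write
 * are hypothetical names, and the kernel's seqc/refcount fast path is
 * considerably more involved.
 */
#if 0
#include <stdatomic.h>

struct seq_slot {
	_Atomic unsigned int	seq;	/* odd while a write is in progress */
	_Atomic int		data;	/* the protected payload */
};

/* Reader: snapshot the counter, copy the data, retry if the counter moved. */
static int
seq_slot_read(struct seq_slot *s)
{
	unsigned int before, after;
	int v;

	for (;;) {
		before = atomic_load_explicit(&s->seq, memory_order_acquire);
		if (before & 1)
			continue;	/* writer is mid-update */
		v = atomic_load_explicit(&s->data, memory_order_relaxed);
		/* Mirrors atomic_thread_fence_acq() + the re-check above. */
		atomic_thread_fence(memory_order_acquire);
		after = atomic_load_explicit(&s->seq, memory_order_relaxed);
		if (before == after)
			return (v);
	}
}

/* Single writer: keep the counter odd for the duration of the update. */
static void
seq_slot_write(struct seq_slot *s, int v)
{
	unsigned int seq;

	seq = atomic_load_explicit(&s->seq, memory_order_relaxed);
	atomic_store_explicit(&s->seq, seq + 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&s->data, v, memory_order_relaxed);
	atomic_store_explicit(&s->seq, seq + 2, memory_order_release);
}
#endif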
 3296 
 3297 /*
 3298  * See the comments in fget_unlocked_seq for an explanation of how this works.
 3299  *
 3300  * This is a simplified variant which bails out to the aforementioned routine
 3301  * if anything goes wrong. In practice this only happens when userspace is
 3302  * racing with itself.
 3303  */
 3304 int
 3305 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 3306     struct file **fpp)
 3307 {
 3308 #ifdef CAPABILITIES
 3309         const struct filedescent *fde;
 3310 #endif
 3311         const struct fdescenttbl *fdt;
 3312         struct file *fp;
 3313 #ifdef CAPABILITIES
 3314         seqc_t seq;
 3315         const cap_rights_t *haverights;
 3316 #endif
 3317 
 3318         fdt = fdp->fd_files;
 3319         if (__predict_false((u_int)fd >= fdt->fdt_nfiles)) {
 3320                 *fpp = NULL;
 3321                 return (EBADF);
 3322         }
 3323 #ifdef CAPABILITIES
 3324         seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 3325         fde = &fdt->fdt_ofiles[fd];
 3326         haverights = cap_rights_fde_inline(fde);
 3327         fp = fde->fde_file;
 3328 #else
 3329         fp = fdt->fdt_ofiles[fd].fde_file;
 3330 #endif
 3331         if (__predict_false(fp == NULL))
 3332                 goto out_fallback;
 3333 #ifdef CAPABILITIES
 3334         if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
 3335                 goto out_fallback;
 3336 #endif
 3337         if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
 3338                 goto out_fallback;
 3339 
 3340         /*
 3341          * Use an acquire barrier to force re-reading of fdt so it is
 3342          * refreshed for verification.
 3343          */
 3344         atomic_thread_fence_acq();
 3345         fdt = fdp->fd_files;
 3346 #ifdef  CAPABILITIES
 3347         if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
 3348 #else
 3349         if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
 3350 #endif
 3351                 goto out_fdrop;
 3352         *fpp = fp;
 3353         return (0);
 3354 out_fdrop:
 3355         fdrop(fp, curthread);
 3356 out_fallback:
 3357         *fpp = NULL;
 3358         return (fget_unlocked_seq(fdp, fd, needrightsp, fpp, NULL));
 3359 }
 3360 
 3361 /*
 3362  * Translate fd -> file when the caller guarantees the file descriptor table
 3363  * can't be changed by others.
 3364  *
 3365  * Note this does not mean the file object itself is only visible to the caller,
 3366  * merely that it won't disappear even though no extra reference is taken.
 3367  *
 3368  * Must be paired with fput_only_user.
 3369  */
 3370 #ifdef  CAPABILITIES
 3371 int
 3372 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 3373     struct file **fpp)
 3374 {
 3375         const struct filedescent *fde;
 3376         const struct fdescenttbl *fdt;
 3377         const cap_rights_t *haverights;
 3378         struct file *fp;
 3379         int error;
 3380 
 3381         MPASS(FILEDESC_IS_ONLY_USER(fdp));
 3382 
 3383         *fpp = NULL;
 3384         if (__predict_false(fd >= fdp->fd_nfiles))
 3385                 return (EBADF);
 3386 
 3387         fdt = fdp->fd_files;
 3388         fde = &fdt->fdt_ofiles[fd];
 3389         fp = fde->fde_file;
 3390         if (__predict_false(fp == NULL))
 3391                 return (EBADF);
 3392         MPASS(refcount_load(&fp->f_count) > 0);
 3393         haverights = cap_rights_fde_inline(fde);
 3394         error = cap_check_inline(haverights, needrightsp);
 3395         if (__predict_false(error != 0))
 3396                 return (error);
 3397         *fpp = fp;
 3398         return (0);
 3399 }
 3400 #else
 3401 int
 3402 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 3403     struct file **fpp)
 3404 {
 3405         struct file *fp;
 3406 
 3407         MPASS(FILEDESC_IS_ONLY_USER(fdp));
 3408 
 3409         *fpp = NULL;
 3410         if (__predict_false(fd >= fdp->fd_nfiles))
 3411                 return (EBADF);
 3412 
 3413         fp = fdp->fd_ofiles[fd].fde_file;
 3414         if (__predict_false(fp == NULL))
 3415                 return (EBADF);
 3416 
 3417         MPASS(refcount_load(&fp->f_count) > 0);
 3418         *fpp = fp;
 3419         return (0);
 3420 }
 3421 #endif
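
/*
 * Illustrative sketch (not part of kern_descrip.c): a minimal consumer of the
 * fget_only_user()/fput_only_user() pairing described above.  The function
 * name example_peek_fd and its body are hypothetical; real callers typically
 * pass a specific rights set rather than cap_no_rights.
 */
#if 0
static int
example_peek_fd(struct thread *td, int fd)
{
	struct filedesc *fdp;
	struct file *fp;
	int error;

	fdp = td->td_proc->p_fd;
	/* Only valid when the caller is the sole user of the table. */
	MPASS(FILEDESC_IS_ONLY_USER(fdp));
	error = fget_only_user(fdp, fd, &cap_no_rights, &fp);
	if (error != 0)
		return (error);
	/* ... inspect fp; no extra reference was taken ... */
	fput_only_user(fdp, fp);
	return (0);
}
#endif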
 3422 
 3423 /*
 3424  * Extract the file pointer associated with the specified descriptor for the
 3425  * current user process.
 3426  *
 3427  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
 3428  * returned.
 3429  *
 3430  * File's rights will be checked against the capability rights mask.
 3431  *
 3432  * If an error occurs, the non-zero error is returned and *fpp is set to
 3433  * NULL.  Otherwise *fpp is held and set and zero is returned.  The caller
 3434  * is responsible for fdrop().
 3435  */
 3436 static __inline int
 3437 _fget(struct thread *td, int fd, struct file **fpp, int flags,
 3438     cap_rights_t *needrightsp)
 3439 {
 3440         struct filedesc *fdp;
 3441         struct file *fp;
 3442         int error;
 3443 
 3444         *fpp = NULL;
 3445         fdp = td->td_proc->p_fd;
 3446         error = fget_unlocked(fdp, fd, needrightsp, &fp);
 3447         if (__predict_false(error != 0))
 3448                 return (error);
 3449         if (__predict_false(fp->f_ops == &badfileops)) {
 3450                 fdrop(fp, td);
 3451                 return (EBADF);
 3452         }
 3453 
 3454         /*
 3455          * FREAD and FWRITE failure return EBADF as per POSIX.
 3456          */
 3457         error = 0;
 3458         switch (flags) {
 3459         case FREAD:
 3460         case FWRITE:
 3461                 if ((fp->f_flag & flags) == 0)
 3462                         error = EBADF;
 3463                 break;
 3464         case FEXEC:
 3465                 if (fp->f_ops != &path_fileops &&
 3466                     ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
 3467                     (fp->f_flag & FWRITE) != 0))
 3468                         error = EBADF;
 3469                 break;
 3470         case 0:
 3471                 break;
 3472         default:
 3473                 KASSERT(0, ("wrong flags"));
 3474         }
 3475 
 3476         if (error != 0) {
 3477                 fdrop(fp, td);
 3478                 return (error);
 3479         }
 3480 
 3481         *fpp = fp;
 3482         return (0);
 3483 }
 3484 
 3485 int
 3486 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 3487 {
 3488 
 3489         return (_fget(td, fd, fpp, 0, rightsp));
 3490 }
 3491 
 3492 int
 3493 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
 3494     struct file **fpp)
 3495 {
 3496         int error;
 3497 #ifndef CAPABILITIES
 3498         error = _fget(td, fd, fpp, 0, rightsp);
 3499         if (maxprotp != NULL)
 3500                 *maxprotp = VM_PROT_ALL;
 3501         return (error);
 3502 #else
 3503         cap_rights_t fdrights;
 3504         struct filedesc *fdp;
 3505         struct file *fp;
 3506         seqc_t seq;
 3507 
 3508         *fpp = NULL;
 3509         fdp = td->td_proc->p_fd;
 3510         MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
 3511         for (;;) {
 3512                 error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
 3513                 if (__predict_false(error != 0))
 3514                         return (error);
 3515                 if (__predict_false(fp->f_ops == &badfileops)) {
 3516                         fdrop(fp, td);
 3517                         return (EBADF);
 3518                 }
 3519                 if (maxprotp != NULL)
 3520                         fdrights = *cap_rights(fdp, fd);
 3521                 if (!fd_modified(fdp, fd, seq))
 3522                         break;
 3523                 fdrop(fp, td);
 3524         }
 3525 
 3526         /*
 3527          * If requested, convert capability rights to access flags.
 3528          */
 3529         if (maxprotp != NULL)
 3530                 *maxprotp = cap_rights_to_vmprot(&fdrights);
 3531         *fpp = fp;
 3532         return (0);
 3533 #endif
 3534 }
 3535 
 3536 int
 3537 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 3538 {
 3539 
 3540         return (_fget(td, fd, fpp, FREAD, rightsp));
 3541 }
 3542 
 3543 int
 3544 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 3545 {
 3546 
 3547         return (_fget(td, fd, fpp, FWRITE, rightsp));
 3548 }
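
/*
 * Illustrative sketch (not part of kern_descrip.c): typical use of the fget
 * family in a syscall handler.  The function name example_fd_bytes and its
 * body are hypothetical; cap_read_rights is the usual Capsicum rights set for
 * read-only access.
 */
#if 0
static int
example_fd_bytes(struct thread *td, int fd)
{
	struct file *fp;
	int error;

	error = fget_read(td, fd, &cap_read_rights, &fp);
	if (error != 0)
		return (error);
	/* ... operate on fp via its fo_*() methods ... */
	fdrop(fp, td);
	return (0);
}
#endif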
 3549 
 3550 int
 3551 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
 3552     struct file **fpp)
 3553 {
 3554         struct filedesc *fdp = td->td_proc->p_fd;
 3555 #ifndef CAPABILITIES
 3556         return (fget_unlocked(fdp, fd, rightsp, fpp));
 3557 #else
 3558         struct file *fp;
 3559         int error;
 3560         seqc_t seq;
 3561 
 3562         *fpp = NULL;
 3563         MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
 3564         for (;;) {
 3565                 error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
 3566                 if (error != 0)
 3567                         return (error);
 3568                 error = cap_fcntl_check(fdp, fd, needfcntl);
 3569                 if (!fd_modified(fdp, fd, seq))
 3570                         break;
 3571                 fdrop(fp, td);
 3572         }
 3573         if (error != 0) {
 3574                 fdrop(fp, td);
 3575                 return (error);
 3576         }
 3577         *fpp = fp;
 3578         return (0);
 3579 #endif
 3580 }
 3581 
 3582 /*
 3583  * Like fget() but loads the underlying vnode, or returns an error if the
 3584  * descriptor does not represent a vnode.  Note that pipes use vnodes but
 3585  * never have VM objects.  The returned vnode will be vref()'d.
 3586  *
 3587  * XXX: what about the unused flags?
 3588  */
 3589 static __inline int
 3590 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
 3591     struct vnode **vpp)
 3592 {
 3593         struct file *fp;
 3594         int error;
 3595 
 3596         *vpp = NULL;
 3597         error = _fget(td, fd, &fp, flags, needrightsp);
 3598         if (error != 0)
 3599                 return (error);
 3600         if (fp->f_vnode == NULL) {
 3601                 error = EINVAL;
 3602         } else {
 3603                 *vpp = fp->f_vnode;
 3604                 vrefact(*vpp);
 3605         }
 3606         fdrop(fp, td);
 3607 
 3608         return (error);
 3609 }
 3610 
 3611 int
 3612 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 3613 {
 3614 
 3615         return (_fgetvp(td, fd, 0, rightsp, vpp));
 3616 }
 3617 
 3618 int
 3619 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
 3620     struct filecaps *havecaps, struct vnode **vpp)
 3621 {
 3622         struct filecaps caps;
 3623         struct file *fp;
 3624         int error;
 3625 
 3626         error = fget_cap(td, fd, needrightsp, &fp, &caps);
 3627         if (error != 0)
 3628                 return (error);
 3629         if (fp->f_ops == &badfileops) {
 3630                 error = EBADF;
 3631                 goto out;
 3632         }
 3633         if (fp->f_vnode == NULL) {
 3634                 error = EINVAL;
 3635                 goto out;
 3636         }
 3637 
 3638         *havecaps = caps;
 3639         *vpp = fp->f_vnode;
 3640         vrefact(*vpp);
 3641         fdrop(fp, td);
 3642 
 3643         return (0);
 3644 out:
 3645         filecaps_free(&caps);
 3646         fdrop(fp, td);
 3647         return (error);
 3648 }
 3649 
 3650 int
 3651 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 3652 {
 3653 
 3654         return (_fgetvp(td, fd, FREAD, rightsp, vpp));
 3655 }
 3656 
 3657 int
 3658 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 3659 {
 3660 
 3661         return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
 3662 }
 3663 
 3664 #ifdef notyet
 3665 int
 3666 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
 3667     struct vnode **vpp)
 3668 {
 3669 
 3670         return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
 3671 }
 3672 #endif
 3673 
 3674 /*
 3675  * Handle the last reference to a file being closed.
 3676  *
 3677  * Without the noinline attribute clang keeps inlining the function throughout
 3678  * this file wherever fdrop() is used.
 3679  */
 3680 int __noinline
 3681 _fdrop(struct file *fp, struct thread *td)
 3682 {
 3683         int error;
 3684 #ifdef INVARIANTS
 3685         int count;
 3686 
 3687         count = refcount_load(&fp->f_count);
 3688         if (count != 0)
 3689                 panic("fdrop: fp %p count %d", fp, count);
 3690 #endif
 3691         error = fo_close(fp, td);
 3692         atomic_subtract_int(&openfiles, 1);
 3693         crfree(fp->f_cred);
 3694         free(fp->f_advice, M_FADVISE);
 3695         uma_zfree(file_zone, fp);
 3696 
 3697         return (error);
 3698 }
 3699 
 3700 /*
 3701  * Apply an advisory lock on a file descriptor.
 3702  *
 3703  * Just attempt to get a record lock of the requested type on the entire file
 3704  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
 3705  */
 3706 #ifndef _SYS_SYSPROTO_H_
 3707 struct flock_args {
 3708         int     fd;
 3709         int     how;
 3710 };
 3711 #endif
 3712 /* ARGSUSED */
 3713 int
 3714 sys_flock(struct thread *td, struct flock_args *uap)
 3715 {
 3716         struct file *fp;
 3717         struct vnode *vp;
 3718         struct flock lf;
 3719         int error;
 3720 
 3721         error = fget(td, uap->fd, &cap_flock_rights, &fp);
 3722         if (error != 0)
 3723                 return (error);
 3724         error = EOPNOTSUPP;
 3725         if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
 3726                 goto done;
 3727         }
 3728         if (fp->f_ops == &path_fileops) {
 3729                 goto done;
 3730         }
 3731 
 3732         error = 0;
 3733         vp = fp->f_vnode;
 3734         lf.l_whence = SEEK_SET;
 3735         lf.l_start = 0;
 3736         lf.l_len = 0;
 3737         if (uap->how & LOCK_UN) {
 3738                 lf.l_type = F_UNLCK;
 3739                 atomic_clear_int(&fp->f_flag, FHASLOCK);
 3740                 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 3741                 goto done;
 3742         }
 3743         if (uap->how & LOCK_EX)
 3744                 lf.l_type = F_WRLCK;
 3745         else if (uap->how & LOCK_SH)
 3746                 lf.l_type = F_RDLCK;
 3747         else {
 3748                 error = EBADF;
 3749                 goto done;
 3750         }
 3751         atomic_set_int(&fp->f_flag, FHASLOCK);
 3752         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 3753             (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 3754 done:
 3755         fdrop(fp, td);
 3756         return (error);
 3757 }
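
/*
 * Illustrative userland sketch (not part of kern_descrip.c): the whole-file
 * record lock set up above corresponds roughly to what a process requests
 * with flock(2) or an equivalent fcntl(2) F_SETLKW call.  The helper names
 * below are hypothetical, and the two interfaces differ in ownership
 * semantics (open file description vs. process).
 */
#if 0
#include <sys/file.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* flock(2) style: exclusive lock on the whole file, blocking. */
static int
lock_whole_file_flock(int fd)
{

	return (flock(fd, LOCK_EX));
}

/* fcntl(2) style: a write record lock spanning the entire file. */
static int
lock_whole_file_fcntl(int fd)
{
	struct flock lf;

	memset(&lf, 0, sizeof(lf));
	lf.l_whence = SEEK_SET;		/* same initialization as sys_flock() */
	lf.l_start = 0;
	lf.l_len = 0;
	lf.l_type = F_WRLCK;
	return (fcntl(fd, F_SETLKW, &lf));
}
#endif
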
 3758 /*
 3759  * Duplicate the specified descriptor to a free descriptor.
 3760  */
 3761 int
 3762 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
 3763     int openerror, int *indxp)
 3764 {
 3765         struct filedescent *newfde, *oldfde;
 3766         struct file *fp;
 3767         u_long *ioctls;
 3768         int error, indx;
 3769 
 3770         KASSERT(openerror == ENODEV || openerror == ENXIO,
 3771             ("unexpected error %d in %s", openerror, __func__));
 3772 
 3773         /*
 3774          * If the to-be-dup'd fd number is greater than the allowed number
 3775          * of file descriptors, or the fd to be dup'd has already been
 3776          * closed, then reject.
 3777          */
 3778         FILEDESC_XLOCK(fdp);
 3779         if ((fp = fget_locked(fdp, dfd)) == NULL) {
 3780                 FILEDESC_XUNLOCK(fdp);
 3781                 return (EBADF);
 3782         }
 3783 
 3784         error = fdalloc(td, 0, &indx);
 3785         if (error != 0) {
 3786                 FILEDESC_XUNLOCK(fdp);
 3787                 return (error);
 3788         }
 3789 
 3790         /*
 3791          * There are two cases of interest here.
 3792          *
 3793          * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 3794          *
 3795          * For ENXIO steal away the file structure from (dfd) and store it in
 3796          * (indx).  (dfd) is effectively closed by this operation.
 3797          */
 3798         switch (openerror) {
 3799         case ENODEV:
 3800                 /*
 3801                  * Check that the mode the file is being opened for is a
 3802                  * subset of the mode of the existing descriptor.
 3803                  */
 3804                 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 3805                         fdunused(fdp, indx);
 3806                         FILEDESC_XUNLOCK(fdp);
 3807                         return (EACCES);
 3808                 }
 3809                 if (!fhold(fp)) {
 3810                         fdunused(fdp, indx);
 3811                         FILEDESC_XUNLOCK(fdp);
 3812                         return (EBADF);
 3813                 }
 3814                 newfde = &fdp->fd_ofiles[indx];
 3815                 oldfde = &fdp->fd_ofiles[dfd];
 3816                 ioctls = filecaps_copy_prep(&oldfde->fde_caps);
 3817 #ifdef CAPABILITIES
 3818                 seqc_write_begin(&newfde->fde_seqc);
 3819 #endif
 3820                 memcpy(newfde, oldfde, fde_change_size);
 3821                 filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
 3822                     ioctls);
 3823 #ifdef CAPABILITIES
 3824                 seqc_write_end(&newfde->fde_seqc);
 3825 #endif
 3826                 break;
 3827         case ENXIO:
 3828                 /*
 3829                  * Steal away the file pointer from dfd and stuff it into indx.
 3830                  */
 3831                 newfde = &fdp->fd_ofiles[indx];
 3832                 oldfde = &fdp->fd_ofiles[dfd];
 3833 #ifdef CAPABILITIES
 3834                 seqc_write_begin(&newfde->fde_seqc);
 3835 #endif
 3836                 memcpy(newfde, oldfde, fde_change_size);
 3837                 oldfde->fde_file = NULL;
 3838                 fdunused(fdp, dfd);
 3839 #ifdef CAPABILITIES
 3840                 seqc_write_end(&newfde->fde_seqc);
 3841 #endif
 3842                 break;
 3843         }
 3844         FILEDESC_XUNLOCK(fdp);
 3845         *indxp = indx;
 3846         return (0);
 3847 }
 3848 
 3849 /*
 3850  * This sysctl determines if we will allow a process to chroot(2) if it
 3851  * has a directory open:
 3852  *      0: disallowed for all processes.
 3853  *      1: allowed for processes that were not already chroot(2)'ed.
 3854  *      2: allowed for all processes.
 3855  */
 3856 
 3857 static int chroot_allow_open_directories = 1;
 3858 
 3859 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
 3860     &chroot_allow_open_directories, 0,
 3861     "Allow a process to chroot(2) if it has a directory open");
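
/*
 * Illustrative userland sketch (not part of kern_descrip.c): the knob above
 * is exported as kern.chroot_allow_open_directories and can be read or
 * tightened with sysctlbyname(3).  The helper name is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

static int
forbid_chroot_with_open_dirs(void)
{
	int val = 0;

	/* 0: refuse chroot(2) for any process with a directory open. */
	return (sysctlbyname("kern.chroot_allow_open_directories",
	    NULL, NULL, &val, sizeof(val)));
}
#endif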
 3862 
 3863 /*
 3864  * Helper function for the raised chroot(2) security setting:  refuse if
 3865  * any file descriptors are open directories.
 3866  */
 3867 static int
 3868 chroot_refuse_vdir_fds(struct filedesc *fdp)
 3869 {
 3870         struct vnode *vp;
 3871         struct file *fp;
 3872         int fd, lastfile;
 3873 
 3874         FILEDESC_LOCK_ASSERT(fdp);
 3875 
 3876         lastfile = fdlastfile(fdp);
 3877         for (fd = 0; fd <= lastfile; fd++) {
 3878                 fp = fget_locked(fdp, fd);
 3879                 if (fp == NULL)
 3880                         continue;
 3881                 if (fp->f_type == DTYPE_VNODE) {
 3882                         vp = fp->f_vnode;
 3883                         if (vp->v_type == VDIR)
 3884                                 return (EPERM);
 3885                 }
 3886         }
 3887         return (0);
 3888 }
 3889 
 3890 static void
 3891 pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
 3892 {
 3893 
 3894         if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) {
 3895                 vrefact(oldpwd->pwd_cdir);
 3896                 newpwd->pwd_cdir = oldpwd->pwd_cdir;
 3897         }
 3898 
 3899         if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) {
 3900                 vrefact(oldpwd->pwd_rdir);
 3901                 newpwd->pwd_rdir = oldpwd->pwd_rdir;
 3902         }
 3903 
 3904         if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) {
 3905                 vrefact(oldpwd->pwd_jdir);
 3906                 newpwd->pwd_jdir = oldpwd->pwd_jdir;
 3907         }
 3908 }
 3909 
 3910 struct pwd *
 3911 pwd_hold_pwddesc(struct pwddesc *pdp)
 3912 {
 3913         struct pwd *pwd;
 3914 
 3915         PWDDESC_ASSERT_XLOCKED(pdp);
 3916         pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 3917         if (pwd != NULL)
 3918                 refcount_acquire(&pwd->pwd_refcount);
 3919         return (pwd);
 3920 }
 3921 
 3922 bool
 3923 pwd_hold_smr(struct pwd *pwd)
 3924 {
 3925 
 3926         MPASS(pwd != NULL);
 3927         if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
 3928                 return (true);
 3929         }
 3930         return (false);
 3931 }
 3932 
 3933 struct pwd *
 3934 pwd_hold(struct thread *td)
 3935 {
 3936         struct pwddesc *pdp;
 3937         struct pwd *pwd;
 3938 
 3939         pdp = td->td_proc->p_pd;
 3940 
 3941         vfs_smr_enter();
 3942         pwd = vfs_smr_entered_load(&pdp->pd_pwd);
 3943         if (pwd_hold_smr(pwd)) {
 3944                 vfs_smr_exit();
 3945                 return (pwd);
 3946         }
 3947         vfs_smr_exit();
 3948         PWDDESC_XLOCK(pdp);
 3949         pwd = pwd_hold_pwddesc(pdp);
 3950         MPASS(pwd != NULL);
 3951         PWDDESC_XUNLOCK(pdp);
 3952         return (pwd);
 3953 }
 3954 
 3955 struct pwd *
 3956 pwd_hold_proc(struct proc *p)
 3957 {
 3958         struct pwddesc *pdp;
 3959         struct pwd *pwd;
 3960 
 3961         PROC_ASSERT_HELD(p);
 3962         PROC_LOCK(p);
 3963         pdp = pdhold(p);
 3964         MPASS(pdp != NULL);
 3965         PROC_UNLOCK(p);
 3966 
 3967         PWDDESC_XLOCK(pdp);
 3968         pwd = pwd_hold_pwddesc(pdp);
 3969         MPASS(pwd != NULL);
 3970         PWDDESC_XUNLOCK(pdp);
 3971         pddrop(pdp);
 3972         return (pwd);
 3973 }
 3974 
 3975 static struct pwd *
 3976 pwd_alloc(void)
 3977 {
 3978         struct pwd *pwd;
 3979 
 3980         pwd = uma_zalloc_smr(pwd_zone, M_WAITOK);
 3981         bzero(pwd, sizeof(*pwd));
 3982         refcount_init(&pwd->pwd_refcount, 1);
 3983         return (pwd);
 3984 }
 3985 
 3986 void
 3987 pwd_drop(struct pwd *pwd)
 3988 {
 3989 
 3990         if (!refcount_release(&pwd->pwd_refcount))
 3991                 return;
 3992 
 3993         if (pwd->pwd_cdir != NULL)
 3994                 vrele(pwd->pwd_cdir);
 3995         if (pwd->pwd_rdir != NULL)
 3996                 vrele(pwd->pwd_rdir);
 3997         if (pwd->pwd_jdir != NULL)
 3998                 vrele(pwd->pwd_jdir);
 3999         uma_zfree_smr(pwd_zone, pwd);
 4000 }
 4001 
 4002 /*
 4003  * The caller is responsible for invoking priv_check() and
 4004  * mac_vnode_check_chroot() to authorize this operation.
 4005  */
 4006 int
 4007 pwd_chroot(struct thread *td, struct vnode *vp)
 4008 {
 4009         struct pwddesc *pdp;
 4010         struct filedesc *fdp;
 4011         struct pwd *newpwd, *oldpwd;
 4012         int error;
 4013 
 4014         fdp = td->td_proc->p_fd;
 4015         pdp = td->td_proc->p_pd;
 4016         newpwd = pwd_alloc();
 4017         FILEDESC_SLOCK(fdp);
 4018         PWDDESC_XLOCK(pdp);
 4019         oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4020         if (chroot_allow_open_directories == 0 ||
 4021             (chroot_allow_open_directories == 1 &&
 4022             oldpwd->pwd_rdir != rootvnode)) {
 4023                 error = chroot_refuse_vdir_fds(fdp);
 4024                 FILEDESC_SUNLOCK(fdp);
 4025                 if (error != 0) {
 4026                         PWDDESC_XUNLOCK(pdp);
 4027                         pwd_drop(newpwd);
 4028                         return (error);
 4029                 }
 4030         } else {
 4031                 FILEDESC_SUNLOCK(fdp);
 4032         }
 4033 
 4034         vrefact(vp);
 4035         newpwd->pwd_rdir = vp;
 4036         if (oldpwd->pwd_jdir == NULL) {
 4037                 vrefact(vp);
 4038                 newpwd->pwd_jdir = vp;
 4039         }
 4040         pwd_fill(oldpwd, newpwd);
 4041         pwd_set(pdp, newpwd);
 4042         PWDDESC_XUNLOCK(pdp);
 4043         pwd_drop(oldpwd);
 4044         return (0);
 4045 }
 4046 
 4047 void
 4048 pwd_chdir(struct thread *td, struct vnode *vp)
 4049 {
 4050         struct pwddesc *pdp;
 4051         struct pwd *newpwd, *oldpwd;
 4052 
 4053         VNPASS(vp->v_usecount > 0, vp);
 4054 
 4055         newpwd = pwd_alloc();
 4056         pdp = td->td_proc->p_pd;
 4057         PWDDESC_XLOCK(pdp);
 4058         oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4059         newpwd->pwd_cdir = vp;
 4060         pwd_fill(oldpwd, newpwd);
 4061         pwd_set(pdp, newpwd);
 4062         PWDDESC_XUNLOCK(pdp);
 4063         pwd_drop(oldpwd);
 4064 }
 4065 
 4066 /*
 4067  * jail_attach(2) changes both root and working directories.
 4068  */
 4069 int
 4070 pwd_chroot_chdir(struct thread *td, struct vnode *vp)
 4071 {
 4072         struct pwddesc *pdp;
 4073         struct filedesc *fdp;
 4074         struct pwd *newpwd, *oldpwd;
 4075         int error;
 4076 
 4077         fdp = td->td_proc->p_fd;
 4078         pdp = td->td_proc->p_pd;
 4079         newpwd = pwd_alloc();
 4080         FILEDESC_SLOCK(fdp);
 4081         PWDDESC_XLOCK(pdp);
 4082         oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4083         error = chroot_refuse_vdir_fds(fdp);
 4084         FILEDESC_SUNLOCK(fdp);
 4085         if (error != 0) {
 4086                 PWDDESC_XUNLOCK(pdp);
 4087                 pwd_drop(newpwd);
 4088                 return (error);
 4089         }
 4090 
 4091         vrefact(vp);
 4092         newpwd->pwd_rdir = vp;
 4093         vrefact(vp);
 4094         newpwd->pwd_cdir = vp;
 4095         if (oldpwd->pwd_jdir == NULL) {
 4096                 vrefact(vp);
 4097                 newpwd->pwd_jdir = vp;
 4098         }
 4099         pwd_fill(oldpwd, newpwd);
 4100         pwd_set(pdp, newpwd);
 4101         PWDDESC_XUNLOCK(pdp);
 4102         pwd_drop(oldpwd);
 4103         return (0);
 4104 }
 4105 
 4106 void
 4107 pwd_ensure_dirs(void)
 4108 {
 4109         struct pwddesc *pdp;
 4110         struct pwd *oldpwd, *newpwd;
 4111 
 4112         pdp = curproc->p_pd;
 4113         PWDDESC_XLOCK(pdp);
 4114         oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4115         if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
 4116                 PWDDESC_XUNLOCK(pdp);
 4117                 return;
 4118         }
 4119         PWDDESC_XUNLOCK(pdp);
 4120 
 4121         newpwd = pwd_alloc();
 4122         PWDDESC_XLOCK(pdp);
 4123         oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4124         pwd_fill(oldpwd, newpwd);
 4125         if (newpwd->pwd_cdir == NULL) {
 4126                 vrefact(rootvnode);
 4127                 newpwd->pwd_cdir = rootvnode;
 4128         }
 4129         if (newpwd->pwd_rdir == NULL) {
 4130                 vrefact(rootvnode);
 4131                 newpwd->pwd_rdir = rootvnode;
 4132         }
 4133         pwd_set(pdp, newpwd);
 4134         PWDDESC_XUNLOCK(pdp);
 4135         pwd_drop(oldpwd);
 4136 }
 4137 
 4138 void
 4139 pwd_set_rootvnode(void)
 4140 {
 4141         struct pwddesc *pdp;
 4142         struct pwd *oldpwd, *newpwd;
 4143 
 4144         pdp = curproc->p_pd;
 4145 
 4146         newpwd = pwd_alloc();
 4147         PWDDESC_XLOCK(pdp);
 4148         oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4149         vrefact(rootvnode);
 4150         newpwd->pwd_cdir = rootvnode;
 4151         vrefact(rootvnode);
 4152         newpwd->pwd_rdir = rootvnode;
 4153         pwd_fill(oldpwd, newpwd);
 4154         pwd_set(pdp, newpwd);
 4155         PWDDESC_XUNLOCK(pdp);
 4156         pwd_drop(oldpwd);
 4157 }
 4158 
 4159 /*
 4160  * Scan all active processes and prisons to see if any of them have a current
 4161  * or root directory of `olddp'. If so, replace them with the new mount point.
 4162  */
 4163 void
 4164 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 4165 {
 4166         struct pwddesc *pdp;
 4167         struct pwd *newpwd, *oldpwd;
 4168         struct prison *pr;
 4169         struct proc *p;
 4170         int nrele;
 4171 
 4172         if (vrefcnt(olddp) == 1)
 4173                 return;
 4174         nrele = 0;
 4175         newpwd = pwd_alloc();
 4176         sx_slock(&allproc_lock);
 4177         FOREACH_PROC_IN_SYSTEM(p) {
 4178                 PROC_LOCK(p);
 4179                 pdp = pdhold(p);
 4180                 PROC_UNLOCK(p);
 4181                 if (pdp == NULL)
 4182                         continue;
 4183                 PWDDESC_XLOCK(pdp);
 4184                 oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4185                 if (oldpwd == NULL ||
 4186                     (oldpwd->pwd_cdir != olddp &&
 4187                     oldpwd->pwd_rdir != olddp &&
 4188                     oldpwd->pwd_jdir != olddp)) {
 4189                         PWDDESC_XUNLOCK(pdp);
 4190                         pddrop(pdp);
 4191                         continue;
 4192                 }
 4193                 if (oldpwd->pwd_cdir == olddp) {
 4194                         vrefact(newdp);
 4195                         newpwd->pwd_cdir = newdp;
 4196                 }
 4197                 if (oldpwd->pwd_rdir == olddp) {
 4198                         vrefact(newdp);
 4199                         newpwd->pwd_rdir = newdp;
 4200                 }
 4201                 if (oldpwd->pwd_jdir == olddp) {
 4202                         vrefact(newdp);
 4203                         newpwd->pwd_jdir = newdp;
 4204                 }
 4205                 pwd_fill(oldpwd, newpwd);
 4206                 pwd_set(pdp, newpwd);
 4207                 PWDDESC_XUNLOCK(pdp);
 4208                 pwd_drop(oldpwd);
 4209                 pddrop(pdp);
 4210                 newpwd = pwd_alloc();
 4211         }
 4212         sx_sunlock(&allproc_lock);
 4213         pwd_drop(newpwd);
 4214         if (rootvnode == olddp) {
 4215                 vrefact(newdp);
 4216                 rootvnode = newdp;
 4217                 nrele++;
 4218         }
 4219         mtx_lock(&prison0.pr_mtx);
 4220         if (prison0.pr_root == olddp) {
 4221                 vrefact(newdp);
 4222                 prison0.pr_root = newdp;
 4223                 nrele++;
 4224         }
 4225         mtx_unlock(&prison0.pr_mtx);
 4226         sx_slock(&allprison_lock);
 4227         TAILQ_FOREACH(pr, &allprison, pr_list) {
 4228                 mtx_lock(&pr->pr_mtx);
 4229                 if (pr->pr_root == olddp) {
 4230                         vrefact(newdp);
 4231                         pr->pr_root = newdp;
 4232                         nrele++;
 4233                 }
 4234                 mtx_unlock(&pr->pr_mtx);
 4235         }
 4236         sx_sunlock(&allprison_lock);
 4237         while (nrele--)
 4238                 vrele(olddp);
 4239 }
 4240 
 4241 int
 4242 descrip_check_write_mp(struct filedesc *fdp, struct mount *mp)
 4243 {
 4244         struct file *fp;
 4245         struct vnode *vp;
 4246         int error, i, lastfile;
 4247 
 4248         error = 0;
 4249         FILEDESC_SLOCK(fdp);
 4250         lastfile = fdlastfile(fdp);
 4251         for (i = 0; i <= lastfile; i++) {
 4252                 fp = fdp->fd_ofiles[i].fde_file;
 4253                 if (fp->f_type != DTYPE_VNODE ||
 4254                     (atomic_load_int(&fp->f_flag) & FWRITE) == 0)
 4255                         continue;
 4256                 vp = fp->f_vnode;
 4257                 if (vp->v_mount == mp) {
 4258                         error = EDEADLK;
 4259                         break;
 4260                 }
 4261         }
 4262         FILEDESC_SUNLOCK(fdp);
 4263         return (error);
 4264 }
 4265 
 4266 struct filedesc_to_leader *
 4267 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp,
 4268     struct proc *leader)
 4269 {
 4270         struct filedesc_to_leader *fdtol;
 4271 
 4272         fdtol = malloc(sizeof(struct filedesc_to_leader),
 4273             M_FILEDESC_TO_LEADER, M_WAITOK);
 4274         fdtol->fdl_refcount = 1;
 4275         fdtol->fdl_holdcount = 0;
 4276         fdtol->fdl_wakeup = 0;
 4277         fdtol->fdl_leader = leader;
 4278         if (old != NULL) {
 4279                 FILEDESC_XLOCK(fdp);
 4280                 fdtol->fdl_next = old->fdl_next;
 4281                 fdtol->fdl_prev = old;
 4282                 old->fdl_next = fdtol;
 4283                 fdtol->fdl_next->fdl_prev = fdtol;
 4284                 FILEDESC_XUNLOCK(fdp);
 4285         } else {
 4286                 fdtol->fdl_next = fdtol;
 4287                 fdtol->fdl_prev = fdtol;
 4288         }
 4289         return (fdtol);
 4290 }
 4291 
 4292 struct filedesc_to_leader *
 4293 filedesc_to_leader_share(struct filedesc_to_leader *fdtol, struct filedesc *fdp)
 4294 {
 4295         FILEDESC_XLOCK(fdp);
 4296         fdtol->fdl_refcount++;
 4297         FILEDESC_XUNLOCK(fdp);
 4298         return (fdtol);
 4299 }
 4300 
 4301 static int
 4302 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
 4303 {
 4304         NDSLOTTYPE *map;
 4305         struct filedesc *fdp;
 4306         u_int namelen;
 4307         int count, off, minoff;
 4308 
 4309         namelen = arg2;
 4310         if (namelen != 1)
 4311                 return (EINVAL);
 4312 
 4313         if (*(int *)arg1 != 0)
 4314                 return (EINVAL);
 4315 
 4316         fdp = curproc->p_fd;
 4317         count = 0;
 4318         FILEDESC_SLOCK(fdp);
 4319         map = fdp->fd_map;
 4320         off = NDSLOT(fdp->fd_nfiles - 1);
 4321         for (minoff = NDSLOT(0); off >= minoff; --off)
 4322                 count += bitcountl(map[off]);
 4323         FILEDESC_SUNLOCK(fdp);
 4324 
 4325         return (SYSCTL_OUT(req, &count, sizeof(count)));
 4326 }
 4327 
 4328 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
 4329     CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
 4330     "Number of open file descriptors");
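
/*
 * Illustrative userland sketch (not part of kern_descrip.c): the handler
 * above answers a {CTL_KERN, KERN_PROC, KERN_PROC_NFDS, 0} query for the
 * calling process.  The helper name count_open_fds is hypothetical and
 * KERN_PROC_NFDS is assumed to come from <sys/user.h>.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>

static int
count_open_fds(int *countp)
{
	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_NFDS, 0 };
	size_t len = sizeof(*countp);

	return (sysctl(mib, 4, countp, &len, NULL, 0));
}
#endif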
 4331 
 4332 /*
 4333  * Get file structures globally.
 4334  */
 4335 static int
 4336 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 4337 {
 4338         struct xfile xf;
 4339         struct filedesc *fdp;
 4340         struct file *fp;
 4341         struct proc *p;
 4342         int error, n, lastfile;
 4343 
 4344         error = sysctl_wire_old_buffer(req, 0);
 4345         if (error != 0)
 4346                 return (error);
 4347         if (req->oldptr == NULL) {
 4348                 n = 0;
 4349                 sx_slock(&allproc_lock);
 4350                 FOREACH_PROC_IN_SYSTEM(p) {
 4351                         PROC_LOCK(p);
 4352                         if (p->p_state == PRS_NEW) {
 4353                                 PROC_UNLOCK(p);
 4354                                 continue;
 4355                         }
 4356                         fdp = fdhold(p);
 4357                         PROC_UNLOCK(p);
 4358                         if (fdp == NULL)
 4359                                 continue;
 4360                         /* overestimates sparse tables. */
 4361                         n += fdp->fd_nfiles;
 4362                         fddrop(fdp);
 4363                 }
 4364                 sx_sunlock(&allproc_lock);
 4365                 return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 4366         }
 4367         error = 0;
 4368         bzero(&xf, sizeof(xf));
 4369         xf.xf_size = sizeof(xf);
 4370         sx_slock(&allproc_lock);
 4371         FOREACH_PROC_IN_SYSTEM(p) {
 4372                 PROC_LOCK(p);
 4373                 if (p->p_state == PRS_NEW) {
 4374                         PROC_UNLOCK(p);
 4375                         continue;
 4376                 }
 4377                 if (p_cansee(req->td, p) != 0) {
 4378                         PROC_UNLOCK(p);
 4379                         continue;
 4380                 }
 4381                 xf.xf_pid = p->p_pid;
 4382                 xf.xf_uid = p->p_ucred->cr_uid;
 4383                 fdp = fdhold(p);
 4384                 PROC_UNLOCK(p);
 4385                 if (fdp == NULL)
 4386                         continue;
 4387                 FILEDESC_SLOCK(fdp);
 4388                 if (refcount_load(&fdp->fd_refcnt) == 0)
 4389                         goto nextproc;
 4390                 lastfile = fdlastfile(fdp);
 4391                 for (n = 0; refcount_load(&fdp->fd_refcnt) > 0 && n <= lastfile;
 4392                     n++) {
 4393                         if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 4394                                 continue;
 4395                         xf.xf_fd = n;
 4396                         xf.xf_file = (uintptr_t)fp;
 4397                         xf.xf_data = (uintptr_t)fp->f_data;
 4398                         xf.xf_vnode = (uintptr_t)fp->f_vnode;
 4399                         xf.xf_type = (uintptr_t)fp->f_type;
 4400                         xf.xf_count = refcount_load(&fp->f_count);
 4401                         xf.xf_msgcount = 0;
 4402                         xf.xf_offset = foffset_get(fp);
 4403                         xf.xf_flag = fp->f_flag;
 4404                         error = SYSCTL_OUT(req, &xf, sizeof(xf));
 4405 
 4406                         /*
 4407                          * There is no need to re-check the fdtable refcount
 4408                          * here since the filedesc lock is not dropped in the
 4409                          * loop body.
 4410                          */
 4411                         if (error != 0)
 4412                                 break;
 4413                 }
 4414 nextproc:
 4415                 FILEDESC_SUNLOCK(fdp);
 4416                 fddrop(fdp);
 4417                 if (error)
 4418                         break;
 4419         }
 4420         sx_sunlock(&allproc_lock);
 4421         return (error);
 4422 }
 4423 
 4424 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
 4425     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 4426 
 4427 #ifdef KINFO_FILE_SIZE
 4428 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 4429 #endif
 4430 
 4431 static int
 4432 xlate_fflags(int fflags)
 4433 {
 4434         static const struct {
 4435                 int     fflag;
 4436                 int     kf_fflag;
 4437         } fflags_table[] = {
 4438                 { FAPPEND, KF_FLAG_APPEND },
 4439                 { FASYNC, KF_FLAG_ASYNC },
 4440                 { FFSYNC, KF_FLAG_FSYNC },
 4441                 { FHASLOCK, KF_FLAG_HASLOCK },
 4442                 { FNONBLOCK, KF_FLAG_NONBLOCK },
 4443                 { FREAD, KF_FLAG_READ },
 4444                 { FWRITE, KF_FLAG_WRITE },
 4445                 { O_CREAT, KF_FLAG_CREAT },
 4446                 { O_DIRECT, KF_FLAG_DIRECT },
 4447                 { O_EXCL, KF_FLAG_EXCL },
 4448                 { O_EXEC, KF_FLAG_EXEC },
 4449                 { O_EXLOCK, KF_FLAG_EXLOCK },
 4450                 { O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 4451                 { O_SHLOCK, KF_FLAG_SHLOCK },
 4452                 { O_TRUNC, KF_FLAG_TRUNC }
 4453         };
 4454         unsigned int i;
 4455         int kflags;
 4456 
 4457         kflags = 0;
 4458         for (i = 0; i < nitems(fflags_table); i++)
 4459                 if (fflags & fflags_table[i].fflag)
 4460                         kflags |=  fflags_table[i].kf_fflag;
 4461         return (kflags);
 4462 }
 4463 
 4464 /* Trim unused data from kf_path by truncating the structure size. */
 4465 void
 4466 pack_kinfo(struct kinfo_file *kif)
 4467 {
 4468 
 4469         kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
 4470             strlen(kif->kf_path) + 1;
 4471         kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 4472 }
 4473 
 4474 static void
 4475 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
 4476     struct kinfo_file *kif, struct filedesc *fdp, int flags)
 4477 {
 4478         int error;
 4479 
 4480         bzero(kif, sizeof(*kif));
 4481 
 4482         /* Set a default type to allow for empty fill_kinfo() methods. */
 4483         kif->kf_type = KF_TYPE_UNKNOWN;
 4484         kif->kf_flags = xlate_fflags(fp->f_flag);
 4485         if (rightsp != NULL)
 4486                 kif->kf_cap_rights = *rightsp;
 4487         else
 4488                 cap_rights_init_zero(&kif->kf_cap_rights);
 4489         kif->kf_fd = fd;
 4490         kif->kf_ref_count = refcount_load(&fp->f_count);
 4491         kif->kf_offset = foffset_get(fp);
 4492 
 4493         /*
 4494          * This may drop the filedesc lock, so the 'fp' cannot be
 4495          * accessed after this call.
 4496          */
 4497         error = fo_fill_kinfo(fp, kif, fdp);
 4498         if (error == 0)
 4499                 kif->kf_status |= KF_ATTR_VALID;
 4500         if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 4501                 pack_kinfo(kif);
 4502         else
 4503                 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 4504 }
 4505 
 4506 static void
 4507 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
 4508     struct kinfo_file *kif, int flags)
 4509 {
 4510         int error;
 4511 
 4512         bzero(kif, sizeof(*kif));
 4513 
 4514         kif->kf_type = KF_TYPE_VNODE;
 4515         error = vn_fill_kinfo_vnode(vp, kif);
 4516         if (error == 0)
 4517                 kif->kf_status |= KF_ATTR_VALID;
 4518         kif->kf_flags = xlate_fflags(fflags);
 4519         cap_rights_init_zero(&kif->kf_cap_rights);
 4520         kif->kf_fd = fd;
 4521         kif->kf_ref_count = -1;
 4522         kif->kf_offset = -1;
 4523         if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 4524                 pack_kinfo(kif);
 4525         else
 4526                 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 4527         vrele(vp);
 4528 }
 4529 
 4530 struct export_fd_buf {
 4531         struct filedesc         *fdp;
 4532         struct pwddesc  *pdp;
 4533         struct sbuf             *sb;
 4534         ssize_t                 remainder;
 4535         struct kinfo_file       kif;
 4536         int                     flags;
 4537 };
 4538 
 4539 static int
 4540 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 4541 {
 4542         struct kinfo_file *kif;
 4543 
 4544         kif = &efbuf->kif;
 4545         if (efbuf->remainder != -1) {
 4546                 if (efbuf->remainder < kif->kf_structsize)
 4547                         return (ENOMEM);
 4548                 efbuf->remainder -= kif->kf_structsize;
 4549         }
 4550         if (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) != 0)
 4551                 return (sbuf_error(efbuf->sb));
 4552         return (0);
 4553 }
 4554 
 4555 static int
 4556 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
 4557     struct export_fd_buf *efbuf)
 4558 {
 4559         int error;
 4560 
 4561         if (efbuf->remainder == 0)
 4562                 return (ENOMEM);
 4563         export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
 4564             efbuf->flags);
 4565         FILEDESC_SUNLOCK(efbuf->fdp);
 4566         error = export_kinfo_to_sb(efbuf);
 4567         FILEDESC_SLOCK(efbuf->fdp);
 4568         return (error);
 4569 }
 4570 
 4571 static int
 4572 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
 4573     struct export_fd_buf *efbuf)
 4574 {
 4575         int error;
 4576 
 4577         if (efbuf->remainder == 0)
 4578                 return (ENOMEM);
 4579         if (efbuf->pdp != NULL)
 4580                 PWDDESC_XUNLOCK(efbuf->pdp);
 4581         export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
 4582         error = export_kinfo_to_sb(efbuf);
 4583         if (efbuf->pdp != NULL)
 4584                 PWDDESC_XLOCK(efbuf->pdp);
 4585         return (error);
 4586 }
 4587 
 4588 /*
 4589  * Store a process's file descriptor information in an sbuf.
 4590  *
 4591  * Takes a locked proc as argument, and returns with the proc unlocked.
 4592  */
 4593 int
 4594 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
 4595     int flags)
 4596 {
 4597         struct file *fp;
 4598         struct filedesc *fdp;
 4599         struct pwddesc *pdp;
 4600         struct export_fd_buf *efbuf;
 4601         struct vnode *cttyvp, *textvp, *tracevp;
 4602         struct pwd *pwd;
 4603         int error, i, lastfile;
 4604         cap_rights_t rights;
 4605 
 4606         PROC_LOCK_ASSERT(p, MA_OWNED);
 4607 
 4608         /* ktrace vnode */
 4609         tracevp = ktr_get_tracevp(p, true);
 4610         /* text vnode */
 4611         textvp = p->p_textvp;
 4612         if (textvp != NULL)
 4613                 vrefact(textvp);
 4614         /* Controlling tty. */
 4615         cttyvp = NULL;
 4616         if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 4617                 cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 4618                 if (cttyvp != NULL)
 4619                         vrefact(cttyvp);
 4620         }
 4621         fdp = fdhold(p);
 4622         pdp = pdhold(p);
 4623         PROC_UNLOCK(p);
 4624 
 4625         efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 4626         efbuf->fdp = NULL;
 4627         efbuf->pdp = NULL;
 4628         efbuf->sb = sb;
 4629         efbuf->remainder = maxlen;
 4630         efbuf->flags = flags;
 4631 
 4632         error = 0;
 4633         if (tracevp != NULL)
 4634                 error = export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE,
 4635                     FREAD | FWRITE, efbuf);
 4636         if (error == 0 && textvp != NULL)
 4637                 error = export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD,
 4638                     efbuf);
 4639         if (error == 0 && cttyvp != NULL)
 4640                 error = export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY,
 4641                     FREAD | FWRITE, efbuf);
 4642         if (error != 0 || pdp == NULL || fdp == NULL)
 4643                 goto fail;
 4644         efbuf->fdp = fdp;
 4645         efbuf->pdp = pdp;
 4646         PWDDESC_XLOCK(pdp);
 4647         pwd = pwd_hold_pwddesc(pdp);
 4648         if (pwd != NULL) {
 4649                 /* working directory */
 4650                 if (pwd->pwd_cdir != NULL) {
 4651                         vrefact(pwd->pwd_cdir);
 4652                         error = export_vnode_to_sb(pwd->pwd_cdir,
 4653                             KF_FD_TYPE_CWD, FREAD, efbuf);
 4654                 }
 4655                 /* root directory */
 4656                 if (error == 0 && pwd->pwd_rdir != NULL) {
 4657                         vrefact(pwd->pwd_rdir);
 4658                         error = export_vnode_to_sb(pwd->pwd_rdir,
 4659                             KF_FD_TYPE_ROOT, FREAD, efbuf);
 4660                 }
 4661                 /* jail directory */
 4662                 if (error == 0 && pwd->pwd_jdir != NULL) {
 4663                         vrefact(pwd->pwd_jdir);
 4664                         error = export_vnode_to_sb(pwd->pwd_jdir,
 4665                             KF_FD_TYPE_JAIL, FREAD, efbuf);
 4666                 }
 4667         }
 4668         PWDDESC_XUNLOCK(pdp);
 4669         if (error != 0)
 4670                 goto fail;
 4671         if (pwd != NULL)
 4672                 pwd_drop(pwd);
 4673         FILEDESC_SLOCK(fdp);
 4674         if (refcount_load(&fdp->fd_refcnt) == 0)
 4675                 goto skip;
 4676         lastfile = fdlastfile(fdp);
 4677         for (i = 0; i <= lastfile; i++) {
 4678                 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 4679                         continue;
 4680 #ifdef CAPABILITIES
 4681                 rights = *cap_rights(fdp, i);
 4682 #else /* !CAPABILITIES */
 4683                 rights = cap_no_rights;
 4684 #endif
 4685                 /*
 4686                  * Create sysctl entry.  It is OK to drop the filedesc
 4687                  * lock inside of export_file_to_sb() as we will
 4688                  * re-validate and re-evaluate its properties when the
 4689                  * loop continues.
 4690                  */
 4691                 error = export_file_to_sb(fp, i, &rights, efbuf);
 4692                 if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0)
 4693                         break;
 4694         }
 4695 skip:
 4696         FILEDESC_SUNLOCK(fdp);
 4697 fail:
 4698         if (fdp != NULL)
 4699                 fddrop(fdp);
 4700         if (pdp != NULL)
 4701                 pddrop(pdp);
 4702         free(efbuf, M_TEMP);
 4703         return (error);
 4704 }
 4705 
 4706 #define FILEDESC_SBUF_SIZE      (sizeof(struct kinfo_file) * 5)
 4707 
 4708 /*
 4709  * Get per-process file descriptors for use by procstat(1), et al.
 4710  */
 4711 static int
 4712 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 4713 {
 4714         struct sbuf sb;
 4715         struct proc *p;
 4716         ssize_t maxlen;
 4717         u_int namelen;
 4718         int error, error2, *name;
 4719 
 4720         namelen = arg2;
 4721         if (namelen != 1)
 4722                 return (EINVAL);
 4723 
 4724         name = (int *)arg1;
 4725 
 4726         sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 4727         sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 4728         error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 4729         if (error != 0) {
 4730                 sbuf_delete(&sb);
 4731                 return (error);
 4732         }
 4733         maxlen = req->oldptr != NULL ? req->oldlen : -1;
 4734         error = kern_proc_filedesc_out(p, &sb, maxlen,
 4735             KERN_FILEDESC_PACK_KINFO);
 4736         error2 = sbuf_finish(&sb);
 4737         sbuf_delete(&sb);
 4738         return (error != 0 ? error : error2);
 4739 }
 4740 
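/*
 * A minimal userland sketch (not part of the kernel build) of how a
 * consumer such as procstat(1) might read the stream of variable-length
 * kinfo_file records produced by sysctl_kern_proc_filedesc() above.  The
 * print_fds() helper is illustrative; records are walked by their
 * kf_structsize field, and error handling plus the usual retry when the
 * descriptor table grows between the two sysctl(3) calls are omitted.
 * libutil's kinfo_getfile(3) wraps the same node.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>

static void
print_fds(pid_t pid)
{
        int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_FILEDESC, pid };
        struct kinfo_file *kif;
        size_t len = 0;
        char *buf, *p;

        /* Size the buffer first, then fetch the packed records. */
        if (sysctl(mib, 4, NULL, &len, NULL, 0) == -1)
                err(1, "kern.proc.filedesc size");
        buf = malloc(len);
        if (buf == NULL || sysctl(mib, 4, buf, &len, NULL, 0) == -1)
                err(1, "kern.proc.filedesc fetch");

        /* Each record advertises its own (packed) size in kf_structsize. */
        for (p = buf; p < buf + len; p += kif->kf_structsize) {
                kif = (struct kinfo_file *)(void *)p;
                if (kif->kf_structsize <= 0)
                        break;
                printf("fd %d type %d path %s\n", kif->kf_fd, kif->kf_type,
                    kif->kf_path);
        }
        free(buf);
}
#endif
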
 4741 #ifdef COMPAT_FREEBSD7
 4742 #ifdef KINFO_OFILE_SIZE
 4743 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 4744 #endif
 4745 
 4746 static void
 4747 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 4748 {
 4749 
 4750         okif->kf_structsize = sizeof(*okif);
 4751         okif->kf_type = kif->kf_type;
 4752         okif->kf_fd = kif->kf_fd;
 4753         okif->kf_ref_count = kif->kf_ref_count;
 4754         okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
 4755             KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
 4756             KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
 4757         okif->kf_offset = kif->kf_offset;
 4758         if (kif->kf_type == KF_TYPE_VNODE)
 4759                 okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
 4760         else
 4761                 okif->kf_vnode_type = KF_VTYPE_VNON;
 4762         strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
 4763         if (kif->kf_type == KF_TYPE_SOCKET) {
 4764                 okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
 4765                 okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
 4766                 okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
 4767                 okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
 4768                 okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
 4769         } else {
 4770                 okif->kf_sa_local.ss_family = AF_UNSPEC;
 4771                 okif->kf_sa_peer.ss_family = AF_UNSPEC;
 4772         }
 4773 }
 4774 
 4775 static int
 4776 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
 4777     struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
 4778 {
 4779         int error;
 4780 
 4781         vrefact(vp);
 4782         PWDDESC_XUNLOCK(pdp);
 4783         export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
 4784         kinfo_to_okinfo(kif, okif);
 4785         error = SYSCTL_OUT(req, okif, sizeof(*okif));
 4786         PWDDESC_XLOCK(pdp);
 4787         return (error);
 4788 }
 4789 
 4790 /*
 4791  * Get per-process file descriptors in the legacy kinfo_ofile format.
 4792  */
 4793 static int
 4794 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 4795 {
 4796         struct kinfo_ofile *okif;
 4797         struct kinfo_file *kif;
 4798         struct filedesc *fdp;
 4799         struct pwddesc *pdp;
 4800         struct pwd *pwd;
 4801         u_int namelen;
 4802         int error, i, lastfile, *name;
 4803         struct file *fp;
 4804         struct proc *p;
 4805 
 4806         namelen = arg2;
 4807         if (namelen != 1)
 4808                 return (EINVAL);
 4809 
 4810         name = (int *)arg1;
 4811         error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 4812         if (error != 0)
 4813                 return (error);
 4814         fdp = fdhold(p);
 4815         if (fdp != NULL)
 4816                 pdp = pdhold(p);
 4817         PROC_UNLOCK(p);
 4818         if (fdp == NULL || pdp == NULL) {
 4819                 if (fdp != NULL)
 4820                         fddrop(fdp);
 4821                 return (ENOENT);
 4822         }
 4823         kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 4824         okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 4825         PWDDESC_XLOCK(pdp);
 4826         pwd = pwd_hold_pwddesc(pdp);
 4827         if (pwd != NULL) {
 4828                 if (pwd->pwd_cdir != NULL)
 4829                         export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif,
 4830                             okif, pdp, req);
 4831                 if (pwd->pwd_rdir != NULL)
 4832                         export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif,
 4833                             okif, pdp, req);
 4834                 if (pwd->pwd_jdir != NULL)
 4835                         export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif,
 4836                             okif, pdp, req);
 4837         }
 4838         PWDDESC_XUNLOCK(pdp);
 4839         if (pwd != NULL)
 4840                 pwd_drop(pwd);
 4841         FILEDESC_SLOCK(fdp);
 4842         if (refcount_load(&fdp->fd_refcnt) == 0)
 4843                 goto skip;
 4844         lastfile = fdlastfile(fdp);
 4845         for (i = 0; i <= lastfile; i++) {
 4846                 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 4847                         continue;
 4848                 export_file_to_kinfo(fp, i, NULL, kif, fdp,
 4849                     KERN_FILEDESC_PACK_KINFO);
 4850                 FILEDESC_SUNLOCK(fdp);
 4851                 kinfo_to_okinfo(kif, okif);
 4852                 error = SYSCTL_OUT(req, okif, sizeof(*okif));
 4853                 FILEDESC_SLOCK(fdp);
 4854                 if (error != 0 || refcount_load(&fdp->fd_refcnt) == 0)
 4855                         break;
 4856         }
 4857 skip:
 4858         FILEDESC_SUNLOCK(fdp);
 4859         fddrop(fdp);
 4860         pddrop(pdp);
 4861         free(kif, M_TEMP);
 4862         free(okif, M_TEMP);
 4863         return (0);
 4864 }
 4865 
 4866 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
 4867     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
 4868     "Process ofiledesc entries");
 4869 #endif  /* COMPAT_FREEBSD7 */
 4870 
 4871 int
 4872 vntype_to_kinfo(int vtype)
 4873 {
 4874         struct {
 4875                 int     vtype;
 4876                 int     kf_vtype;
 4877         } vtypes_table[] = {
 4878                 { VBAD, KF_VTYPE_VBAD },
 4879                 { VBLK, KF_VTYPE_VBLK },
 4880                 { VCHR, KF_VTYPE_VCHR },
 4881                 { VDIR, KF_VTYPE_VDIR },
 4882                 { VFIFO, KF_VTYPE_VFIFO },
 4883                 { VLNK, KF_VTYPE_VLNK },
 4884                 { VNON, KF_VTYPE_VNON },
 4885                 { VREG, KF_VTYPE_VREG },
 4886                 { VSOCK, KF_VTYPE_VSOCK }
 4887         };
 4888         unsigned int i;
 4889 
 4890         /*
 4891          * Perform vtype translation.
 4892          */
 4893         for (i = 0; i < nitems(vtypes_table); i++)
 4894                 if (vtypes_table[i].vtype == vtype)
 4895                         return (vtypes_table[i].kf_vtype);
 4896 
 4897         return (KF_VTYPE_UNKNOWN);
 4898 }
 4899 
 4900 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
 4901     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
 4902     "Process filedesc entries");
 4903 
 4904 /*
 4905  * Store a process's current working directory information into the sbuf.
 4906  *
 4907  * Takes a locked proc as argument, and returns with the proc unlocked.
 4908  */
 4909 int
 4910 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 4911 {
 4912         struct pwddesc *pdp;
 4913         struct pwd *pwd;
 4914         struct export_fd_buf *efbuf;
 4915         struct vnode *cdir;
 4916         int error;
 4917 
 4918         PROC_LOCK_ASSERT(p, MA_OWNED);
 4919 
 4920         pdp = pdhold(p);
 4921         PROC_UNLOCK(p);
 4922         if (pdp == NULL)
 4923                 return (EINVAL);
 4924 
 4925         efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 4926         efbuf->fdp = NULL;
 4927         efbuf->pdp = pdp;
 4928         efbuf->sb = sb;
 4929         efbuf->remainder = maxlen;
 4930         efbuf->flags = 0;
 4931 
 4932         PWDDESC_XLOCK(pdp);
 4933         pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 4934         cdir = pwd->pwd_cdir;
 4935         if (cdir == NULL) {
 4936                 error = EINVAL;
 4937         } else {
 4938                 vrefact(cdir);
 4939                 error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 4940         }
 4941         PWDDESC_XUNLOCK(pdp);
 4942         pddrop(pdp);
 4943         free(efbuf, M_TEMP);
 4944         return (error);
 4945 }
 4946 
 4947 /*
 4948  * Get per-process current working directory.
 4949  */
 4950 static int
 4951 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 4952 {
 4953         struct sbuf sb;
 4954         struct proc *p;
 4955         ssize_t maxlen;
 4956         u_int namelen;
 4957         int error, error2, *name;
 4958 
 4959         namelen = arg2;
 4960         if (namelen != 1)
 4961                 return (EINVAL);
 4962 
 4963         name = (int *)arg1;
 4964 
 4965         sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 4966         sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 4967         error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 4968         if (error != 0) {
 4969                 sbuf_delete(&sb);
 4970                 return (error);
 4971         }
 4972         maxlen = req->oldptr != NULL ? req->oldlen : -1;
 4973         error = kern_proc_cwd_out(p, &sb, maxlen);
 4974         error2 = sbuf_finish(&sb);
 4975         sbuf_delete(&sb);
 4976         return (error != 0 ? error : error2);
 4977 }
 4978 
 4979 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
 4980     sysctl_kern_proc_cwd, "Process current working directory");
 4981 
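/*
 * A minimal userland sketch (not part of the kernel build) of a caller of
 * the kern.proc.cwd node defined above.  kern_proc_cwd_out() exports a
 * single packed kinfo_file record describing the working directory, so a
 * fixed-size buffer suffices; the print_cwd() helper is illustrative and
 * error handling is abbreviated.
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>

#include <err.h>
#include <stdio.h>

static void
print_cwd(pid_t pid)
{
        int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_CWD, pid };
        struct kinfo_file kif;
        size_t len = sizeof(kif);

        /* The packed record is NUL-terminated within the copied bytes. */
        if (sysctl(mib, 4, &kif, &len, NULL, 0) == -1)
                err(1, "kern.proc.cwd");
        printf("pid %d cwd %s\n", (int)pid, kif.kf_path);
}
#endif
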
 4982 #ifdef DDB
 4983 /*
 4984  * For the purposes of debugging, generate a human-readable string for the
 4985  * file type.
 4986  */
 4987 static const char *
 4988 file_type_to_name(short type)
 4989 {
 4990 
 4991         switch (type) {
 4992         case 0:
 4993                 return ("zero");
 4994         case DTYPE_VNODE:
 4995                 return ("vnode");
 4996         case DTYPE_SOCKET:
 4997                 return ("socket");
 4998         case DTYPE_PIPE:
 4999                 return ("pipe");
 5000         case DTYPE_FIFO:
 5001                 return ("fifo");
 5002         case DTYPE_KQUEUE:
 5003                 return ("kqueue");
 5004         case DTYPE_CRYPTO:
 5005                 return ("crypto");
 5006         case DTYPE_MQUEUE:
 5007                 return ("mqueue");
 5008         case DTYPE_SHM:
 5009                 return ("shm");
 5010         case DTYPE_SEM:
 5011                 return ("ksem");
 5012         case DTYPE_PTS:
 5013                 return ("pts");
 5014         case DTYPE_DEV:
 5015                 return ("dev");
 5016         case DTYPE_PROCDESC:
 5017                 return ("proc");
 5018         case DTYPE_EVENTFD:
 5019                 return ("eventfd");
 5020         case DTYPE_LINUXTFD:
 5021                 return ("ltimer");
 5022         default:
 5023                 return ("unkn");
 5024         }
 5025 }
 5026 
 5027 /*
 5028  * For the purposes of debugging, identify a process (if any, perhaps one of
 5029  * many) that references the passed file in its file descriptor array. Return
 5030  * NULL if none.
 5031  */
 5032 static struct proc *
 5033 file_to_first_proc(struct file *fp)
 5034 {
 5035         struct filedesc *fdp;
 5036         struct proc *p;
 5037         int n;
 5038 
 5039         FOREACH_PROC_IN_SYSTEM(p) {
 5040                 if (p->p_state == PRS_NEW)
 5041                         continue;
 5042                 fdp = p->p_fd;
 5043                 if (fdp == NULL)
 5044                         continue;
 5045                 for (n = 0; n < fdp->fd_nfiles; n++) {
 5046                         if (fp == fdp->fd_ofiles[n].fde_file)
 5047                                 return (p);
 5048                 }
 5049         }
 5050         return (NULL);
 5051 }
 5052 
 5053 static void
 5054 db_print_file(struct file *fp, int header)
 5055 {
 5056 #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
 5057         struct proc *p;
 5058 
 5059         if (header)
 5060                 db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
 5061                     XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
 5062                     "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
 5063                     "FCmd");
 5064         p = file_to_first_proc(fp);
 5065         db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
 5066             fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
 5067             fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode,
 5068             p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
 5069 
 5070 #undef XPTRWIDTH
 5071 }
 5072 
 5073 DB_SHOW_COMMAND(file, db_show_file)
 5074 {
 5075         struct file *fp;
 5076 
 5077         if (!have_addr) {
 5078                 db_printf("usage: show file <addr>\n");
 5079                 return;
 5080         }
 5081         fp = (struct file *)addr;
 5082         db_print_file(fp, 1);
 5083 }
 5084 
 5085 DB_SHOW_COMMAND(files, db_show_files)
 5086 {
 5087         struct filedesc *fdp;
 5088         struct file *fp;
 5089         struct proc *p;
 5090         int header;
 5091         int n;
 5092 
 5093         header = 1;
 5094         FOREACH_PROC_IN_SYSTEM(p) {
 5095                 if (p->p_state == PRS_NEW)
 5096                         continue;
 5097                 if ((fdp = p->p_fd) == NULL)
 5098                         continue;
 5099                 for (n = 0; n < fdp->fd_nfiles; ++n) {
 5100                         if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 5101                                 continue;
 5102                         db_print_file(fp, header);
 5103                         header = 0;
 5104                 }
 5105         }
 5106 }
 5107 #endif
 5108 
 5109 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
 5110     &maxfilesperproc, 0, "Maximum files allowed open per process");
 5111 
 5112 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
 5113     &maxfiles, 0, "Maximum number of files");
 5114 
 5115 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
 5116     &openfiles, 0, "System-wide number of open files");
 5117 
 5118 /* ARGSUSED */
 5119 static void
 5120 filelistinit(void *dummy)
 5121 {
 5122 
 5123         file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 5124             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 5125         filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
 5126             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 5127         pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
 5128             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
 5129         /*
 5130          * XXXMJG this is a temporary hack due to boot ordering issues against
 5131          * the vnode zone.
 5132          */
 5133         vfs_smr = uma_zone_get_smr(pwd_zone);
 5134         mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 5135 }
 5136 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 5137 
 5138 /*-------------------------------------------------------------------*/
 5139 
 5140 static int
 5141 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
 5142     int flags, struct thread *td)
 5143 {
 5144 
 5145         return (EBADF);
 5146 }
 5147 
 5148 static int
 5149 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
 5150     struct thread *td)
 5151 {
 5152 
 5153         return (EINVAL);
 5154 }
 5155 
 5156 static int
 5157 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
 5158     struct thread *td)
 5159 {
 5160 
 5161         return (EBADF);
 5162 }
 5163 
 5164 static int
 5165 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
 5166     struct thread *td)
 5167 {
 5168 
 5169         return (0);
 5170 }
 5171 
 5172 static int
 5173 badfo_kqfilter(struct file *fp, struct knote *kn)
 5174 {
 5175 
 5176         return (EBADF);
 5177 }
 5178 
 5179 static int
 5180 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
 5181     struct thread *td)
 5182 {
 5183 
 5184         return (EBADF);
 5185 }
 5186 
 5187 static int
 5188 badfo_close(struct file *fp, struct thread *td)
 5189 {
 5190 
 5191         return (0);
 5192 }
 5193 
 5194 static int
 5195 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
 5196     struct thread *td)
 5197 {
 5198 
 5199         return (EBADF);
 5200 }
 5201 
 5202 static int
 5203 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 5204     struct thread *td)
 5205 {
 5206 
 5207         return (EBADF);
 5208 }
 5209 
 5210 static int
 5211 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 5212     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
 5213     struct thread *td)
 5214 {
 5215 
 5216         return (EBADF);
 5217 }
 5218 
 5219 static int
 5220 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 5221 {
 5222 
 5223         return (0);
 5224 }
 5225 
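/*
 * Operations vector for a file with no usable backing object: the badfo_*
 * stubs above make I/O, ioctl and metadata operations fail (mostly with
 * EBADF), report no poll events, and turn close into a harmless no-op.
 */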
 5226 struct fileops badfileops = {
 5227         .fo_read = badfo_readwrite,
 5228         .fo_write = badfo_readwrite,
 5229         .fo_truncate = badfo_truncate,
 5230         .fo_ioctl = badfo_ioctl,
 5231         .fo_poll = badfo_poll,
 5232         .fo_kqfilter = badfo_kqfilter,
 5233         .fo_stat = badfo_stat,
 5234         .fo_close = badfo_close,
 5235         .fo_chmod = badfo_chmod,
 5236         .fo_chown = badfo_chown,
 5237         .fo_sendfile = badfo_sendfile,
 5238         .fo_fill_kinfo = badfo_fill_kinfo,
 5239 };
 5240 
 5241 static int
 5242 path_poll(struct file *fp, int events, struct ucred *active_cred,
 5243     struct thread *td)
 5244 {
 5245         return (POLLNVAL);
 5246 }
 5247 
 5248 static int
 5249 path_close(struct file *fp, struct thread *td)
 5250 {
 5251         MPASS(fp->f_type == DTYPE_VNODE);
 5252         fp->f_ops = &badfileops;
 5253         vrele(fp->f_vnode);
 5254         return (0);
 5255 }
 5256 
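/*
 * Operations vector for O_PATH-style vnode descriptors, i.e. files opened
 * without read/write access: plain I/O, ioctl and ownership/mode changes
 * are rejected, while fstat(), a limited set of kevent(2) filters, and
 * close() remain functional through the vn_* handlers.
 */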
 5257 struct fileops path_fileops = {
 5258         .fo_read = badfo_readwrite,
 5259         .fo_write = badfo_readwrite,
 5260         .fo_truncate = badfo_truncate,
 5261         .fo_ioctl = badfo_ioctl,
 5262         .fo_poll = path_poll,
 5263         .fo_kqfilter = vn_kqfilter_opath,
 5264         .fo_stat = vn_statfile,
 5265         .fo_close = path_close,
 5266         .fo_chmod = badfo_chmod,
 5267         .fo_chown = badfo_chown,
 5268         .fo_sendfile = badfo_sendfile,
 5269         .fo_fill_kinfo = vn_fill_kinfo,
 5270         .fo_flags = DFLAG_PASSABLE,
 5271 };
 5272 
 5273 int
 5274 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
 5275     int flags, struct thread *td)
 5276 {
 5277 
 5278         return (EOPNOTSUPP);
 5279 }
 5280 
 5281 int
 5282 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
 5283     struct thread *td)
 5284 {
 5285 
 5286         return (EINVAL);
 5287 }
 5288 
 5289 int
 5290 invfo_ioctl(struct file *fp, u_long com, void *data,
 5291     struct ucred *active_cred, struct thread *td)
 5292 {
 5293 
 5294         return (ENOTTY);
 5295 }
 5296 
 5297 int
 5298 invfo_poll(struct file *fp, int events, struct ucred *active_cred,
 5299     struct thread *td)
 5300 {
 5301 
 5302         return (poll_no_poll(events));
 5303 }
 5304 
 5305 int
 5306 invfo_kqfilter(struct file *fp, struct knote *kn)
 5307 {
 5308 
 5309         return (EINVAL);
 5310 }
 5311 
 5312 int
 5313 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
 5314     struct thread *td)
 5315 {
 5316 
 5317         return (EINVAL);
 5318 }
 5319 
 5320 int
 5321 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 5322     struct thread *td)
 5323 {
 5324 
 5325         return (EINVAL);
 5326 }
 5327 
 5328 int
 5329 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 5330     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
 5331     struct thread *td)
 5332 {
 5333 
 5334         return (EINVAL);
 5335 }
 5336 
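/*
 * A hedged sketch (not part of this file) of how a hypothetical file type
 * that only implements ioctl and close might reuse the invfo_* stubs above
 * for the operations it does not support.  The example_* names are
 * illustrative only; real consumers live in other source files and usually
 * supply their own fo_stat and fo_fill_kinfo methods as well.
 */
#if 0
static int
example_ioctl(struct file *fp, u_long com, void *data,
    struct ucred *active_cred, struct thread *td)
{

        /* Device-specific command handling would go here. */
        return (ENOTTY);
}

static int
example_close(struct file *fp, struct thread *td)
{

        /* Release whatever state fp->f_data points at. */
        return (0);
}

static struct fileops example_fileops = {
        .fo_read = invfo_rdwr,
        .fo_write = invfo_rdwr,
        .fo_truncate = invfo_truncate,
        .fo_ioctl = example_ioctl,
        .fo_poll = invfo_poll,
        .fo_kqfilter = invfo_kqfilter,
        .fo_stat = badfo_stat,
        .fo_close = example_close,
        .fo_chmod = invfo_chmod,
        .fo_chown = invfo_chown,
        .fo_sendfile = invfo_sendfile,
        .fo_fill_kinfo = badfo_fill_kinfo,
        .fo_flags = DFLAG_PASSABLE,
};
#endif
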
 5337 /*-------------------------------------------------------------------*/
 5338 
 5339 /*
 5340  * File Descriptor pseudo-device driver (/dev/fd/).
 5341  *
 5342  * Opening minor device N dup()s the file (if any) connected to file
 5343  * descriptor N belonging to the calling process.  Note that this driver
 5344  * consists of only the ``open()'' routine, because all subsequent
 5345  * references to this file will be directed to the other driver.
 5346  *
 5347  * XXX: we could give this one a cloning event handler if necessary.
 5348  */
 5349 
 5350 /* ARGSUSED */
 5351 static int
 5352 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 5353 {
 5354 
 5355         /*
 5356          * XXX Kludge: set curthread->td_dupfd to contain the value of
 5357          * the file descriptor being sought for duplication. The error
 5358          * return ensures that the vnode for this device will be released
 5359          * by vn_open. Open will detect this special error and take the
 5360          * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 5361          * will simply report the error.
 5362          */
 5363         td->td_dupfd = dev2unit(dev);
 5364         return (ENODEV);
 5365 }
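
/*
 * A minimal userland sketch (not part of the kernel build) of the
 * behaviour described above: with the fd/0..fd/2 nodes created by
 * fildesc_drvinit() below (or with fdescfs(5) mounted on /dev/fd),
 * opening /dev/fd/N yields a descriptor that behaves like dup(N).
 * Error handling is abbreviated.
 */
#if 0
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        char buf[128];
        ssize_t n;
        int fd;

        /* Roughly equivalent to dup(STDIN_FILENO). */
        fd = open("/dev/fd/0", O_RDONLY);
        if (fd == -1)
                err(1, "/dev/fd/0");
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                printf("read via /dev/fd/0: %s", buf);
        }
        close(fd);
        return (0);
}
#endif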
 5366 
 5367 static struct cdevsw fildesc_cdevsw = {
 5368         .d_version =    D_VERSION,
 5369         .d_open =       fdopen,
 5370         .d_name =       "FD",
 5371 };
 5372 
 5373 static void
 5374 fildesc_drvinit(void *unused)
 5375 {
 5376         struct cdev *dev;
 5377 
 5378         dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
 5379             UID_ROOT, GID_WHEEL, 0666, "fd/0");
 5380         make_dev_alias(dev, "stdin");
 5381         dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
 5382             UID_ROOT, GID_WHEEL, 0666, "fd/1");
 5383         make_dev_alias(dev, "stdout");
 5384         dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
 5385             UID_ROOT, GID_WHEEL, 0666, "fd/2");
 5386         make_dev_alias(dev, "stderr");
 5387 }
 5388 
 5389 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
