kern_descrip.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1991, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)kern_descrip.c      8.6 (Berkeley) 4/19/94
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD$");
   39 
   40 #include "opt_capsicum.h"
   41 #include "opt_compat.h"
   42 #include "opt_ddb.h"
   43 #include "opt_ktrace.h"
   44 
   45 #include <sys/param.h>
   46 #include <sys/systm.h>
   47 
   48 #include <sys/capsicum.h>
   49 #include <sys/conf.h>
   50 #include <sys/fcntl.h>
   51 #include <sys/file.h>
   52 #include <sys/filedesc.h>
   53 #include <sys/filio.h>
   54 #include <sys/jail.h>
   55 #include <sys/kernel.h>
   56 #include <sys/limits.h>
   57 #include <sys/lock.h>
   58 #include <sys/malloc.h>
   59 #include <sys/mount.h>
   60 #include <sys/mutex.h>
   61 #include <sys/namei.h>
   62 #include <sys/selinfo.h>
   63 #include <sys/priv.h>
   64 #include <sys/proc.h>
   65 #include <sys/protosw.h>
   66 #include <sys/racct.h>
   67 #include <sys/resourcevar.h>
   68 #include <sys/sbuf.h>
   69 #include <sys/signalvar.h>
   70 #include <sys/kdb.h>
   71 #include <sys/stat.h>
   72 #include <sys/sx.h>
   73 #include <sys/syscallsubr.h>
   74 #include <sys/sysctl.h>
   75 #include <sys/sysproto.h>
   76 #include <sys/unistd.h>
   77 #include <sys/user.h>
   78 #include <sys/vnode.h>
   79 #ifdef KTRACE
   80 #include <sys/ktrace.h>
   81 #endif
   82 
   83 #include <net/vnet.h>
   84 
   85 #include <security/audit/audit.h>
   86 
   87 #include <vm/uma.h>
   88 #include <vm/vm.h>
   89 
   90 #include <ddb/ddb.h>
   91 
   /*
    * Malloc type definitions used by the descriptor code.  The first three
    * are declared static and are therefore private to this file; M_FILECAPS
    * is defined without "static" and so has external linkage.
    */
   static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
   static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
       "file desc to leader structures");
   static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
   MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");

   /*
    * M_FADVISE is only declared here.
    * NOTE(review): presumably it is defined in another file -- confirm.
    */
   MALLOC_DECLARE(M_FADVISE);

   /*
    * UMA zones private to this file.
    * NOTE(review): judging by the names these back "struct file" and
    * "struct filedesc0" allocations; their creation is not visible in this
    * part of the file -- confirm.
    */
   static __read_mostly uma_zone_t file_zone;
   static __read_mostly uma_zone_t filedesc0_zone;

   /*
    * Forward declarations of file-local helpers so that they can be used
    * before their definitions.
    */
   static int      closefp(struct filedesc *fdp, int fd, struct file *fp,
                       struct thread *td, int holdleaders);
   static int      fd_first_free(struct filedesc *fdp, int low, int size);
   static int      fd_last_used(struct filedesc *fdp, int size);
   static void     fdgrowtable(struct filedesc *fdp, int nfd);
   static void     fdgrowtable_exp(struct filedesc *fdp, int nfd);
   static void     fdunused(struct filedesc *fdp, int fd);
   static void     fdused(struct filedesc *fdp, int fd);
   static int      getmaxfd(struct thread *td);
  112 
  /*
   * Each process has:
   *
   * - An array of open file descriptors (fd_ofiles)
   * - An array of file flags (fd_ofileflags)
   * - A bitmap recording which descriptors are in use (fd_map)
   *
   * NOTE(review): the code in this file reads per-descriptor flags from the
   * fde_flags member of each fd_ofiles entry; no separate fd_ofileflags
   * array is referenced here, so the second bullet may be out of date.
   *
   * A process starts out with NDFILE descriptors.  The value of NDFILE has
   * been selected based on the historical limit of 20 open files, and an
   * assumption that the majority of processes, especially short-lived
   * processes like shells, will never need more.
   *
   * If this initial allocation is exhausted, a larger descriptor table and
   * map are allocated dynamically, and the pointers in the process's struct
   * filedesc are updated to point to those.  This is repeated every time
   * the process runs out of file descriptors (provided it hasn't hit its
   * resource limit).
   *
   * Since threads may hold references to individual descriptor table
   * entries, the tables are never freed.  Instead, they are placed on a
   * linked list and freed only when the struct filedesc is released.
   */
  /* Number of descriptor slots in the initial, statically sized table. */
  #define NDFILE          20
  /* Size in bytes of one word of the in-use bitmap. */
  #define NDSLOTSIZE      sizeof(NDSLOTTYPE)
  /* Number of descriptors tracked by one bitmap word (one bit each). */
  #define NDENTRIES       (NDSLOTSIZE * __CHAR_BIT)
  /* Index of the bitmap word that holds the bit for descriptor x. */
  #define NDSLOT(x)       ((x) / NDENTRIES)
  /* Mask selecting the bit for descriptor x within its bitmap word. */
  #define NDBIT(x)        ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
  /* Number of bitmap words needed to cover x descriptors (rounded up). */
  #define NDSLOTS(x)      (((x) + NDENTRIES - 1) / NDENTRIES)
  141 
  /*
   * SLIST entry used to keep track of ofiles which must be reclaimed when
   * the process exits.
   *
   * ft_table points at a descriptor table and ft_next links the entry
   * into the singly-linked list headed by fd_free in struct filedesc0.
   */
  struct freetable {
          struct fdescenttbl *ft_table;
          SLIST_ENTRY(freetable) ft_next;
  };
  150 
  /*
   * Initial allocation: a filedesc structure + the head of SLIST used to
   * keep track of old ofiles + enough space for NDFILE descriptors.
   */

  /*
   * Fixed-size descriptor table: an entry count followed by room for
   * exactly NDFILE entries.
   * NOTE(review): presumably laid out to mirror struct fdescenttbl so the
   * two can be used interchangeably -- that structure is not visible here.
   */
  struct fdescenttbl0 {
          int     fdt_nfiles;
          struct  filedescent fdt_ofiles[NDFILE];
  };

  /*
   * fd_fd is the first member, followed by the list head for retired
   * tables (see struct freetable), the embedded initial table and an
   * in-use bitmap sized for NDFILE descriptors.
   */
  struct filedesc0 {
          struct filedesc fd_fd;
          SLIST_HEAD(, freetable) fd_free;
          struct  fdescenttbl0 fd_dfiles;
          NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
  };
  167 
  /*
   * Descriptor management.
   *
   * Global state with external linkage:
   *  - openfiles:  count of open files (see the trailing comment).
   *  - sigio_lock: mutex protecting pointers to sigio structures.
   *  - mq_fdclose: function pointer hook taking (td, fd, fp).
   *    NOTE(review): presumably installed by the POSIX message queue code
   *    and invoked when a descriptor is closed; neither the assignment nor
   *    the call is visible in this part of the file -- confirm.
   */
  volatile int __exclusive_cache_line openfiles; /* actual number of open files */
  struct mtx sigio_lock;          /* mtx to protect pointers to sigio */
  void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
  174 
  175 /*
  176  * If low >= size, just return low. Otherwise find the first zero bit in the
  177  * given bitmap, starting at low and not exceeding size - 1. Return size if
  178  * not found.
  179  */
  180 static int
  181 fd_first_free(struct filedesc *fdp, int low, int size)
  182 {
  183         NDSLOTTYPE *map = fdp->fd_map;
  184         NDSLOTTYPE mask;
  185         int off, maxoff;
  186 
  187         if (low >= size)
  188                 return (low);
  189 
  190         off = NDSLOT(low);
  191         if (low % NDENTRIES) {
  192                 mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
  193                 if ((mask &= ~map[off]) != 0UL)
  194                         return (off * NDENTRIES + ffsl(mask) - 1);
  195                 ++off;
  196         }
  197         for (maxoff = NDSLOTS(size); off < maxoff; ++off)
  198                 if (map[off] != ~0UL)
  199                         return (off * NDENTRIES + ffsl(~map[off]) - 1);
  200         return (size);
  201 }
  202 
  203 /*
  204  * Find the highest non-zero bit in the given bitmap, starting at 0 and
  205  * not exceeding size - 1. Return -1 if not found.
  206  */
  207 static int
  208 fd_last_used(struct filedesc *fdp, int size)
  209 {
  210         NDSLOTTYPE *map = fdp->fd_map;
  211         NDSLOTTYPE mask;
  212         int off, minoff;
  213 
  214         off = NDSLOT(size);
  215         if (size % NDENTRIES) {
  216                 mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
  217                 if ((mask &= map[off]) != 0)
  218                         return (off * NDENTRIES + flsl(mask) - 1);
  219                 --off;
  220         }
  221         for (minoff = NDSLOT(0); off >= minoff; --off)
  222                 if (map[off] != 0)
  223                         return (off * NDENTRIES + flsl(map[off]) - 1);
  224         return (-1);
  225 }
  226 
  227 static int
  228 fdisused(struct filedesc *fdp, int fd)
  229 {
  230 
  231         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
  232             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
  233 
  234         return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
  235 }
  236 
  237 /*
  238  * Mark a file descriptor as used.
  239  */
  240 static void
  241 fdused_init(struct filedesc *fdp, int fd)
  242 {
  243 
  244         KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
  245 
  246         fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
  247 }
  248 
  249 static void
  250 fdused(struct filedesc *fdp, int fd)
  251 {
  252 
  253         FILEDESC_XLOCK_ASSERT(fdp);
  254 
  255         fdused_init(fdp, fd);
  256         if (fd > fdp->fd_lastfile)
  257                 fdp->fd_lastfile = fd;
  258         if (fd == fdp->fd_freefile)
  259                 fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
  260 }
  261 
  262 /*
  263  * Mark a file descriptor as unused.
  264  */
  265 static void
  266 fdunused(struct filedesc *fdp, int fd)
  267 {
  268 
  269         FILEDESC_XLOCK_ASSERT(fdp);
  270 
  271         KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
  272         KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
  273             ("fd=%d is still in use", fd));
  274 
  275         fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
  276         if (fd < fdp->fd_freefile)
  277                 fdp->fd_freefile = fd;
  278         if (fd == fdp->fd_lastfile)
  279                 fdp->fd_lastfile = fd_last_used(fdp, fd);
  280 }
  281 
  282 /*
  283  * Free a file descriptor.
  284  *
  285  * Avoid some work if fdp is about to be destroyed.
  286  */
  287 static inline void
  288 fdefree_last(struct filedescent *fde)
  289 {
  290 
  291         filecaps_free(&fde->fde_caps);
  292 }
  293 
  294 static inline void
  295 fdfree(struct filedesc *fdp, int fd)
  296 {
  297         struct filedescent *fde;
  298 
  299         fde = &fdp->fd_ofiles[fd];
  300 #ifdef CAPABILITIES
  301         seq_write_begin(&fde->fde_seq);
  302 #endif
  303         fdefree_last(fde);
  304         fde->fde_file = NULL;
  305         fdunused(fdp, fd);
  306 #ifdef CAPABILITIES
  307         seq_write_end(&fde->fde_seq);
  308 #endif
  309 }
  310 
  311 void
  312 pwd_ensure_dirs(void)
  313 {
  314         struct filedesc *fdp;
  315 
  316         fdp = curproc->p_fd;
  317         FILEDESC_XLOCK(fdp);
  318         if (fdp->fd_cdir == NULL) {
  319                 fdp->fd_cdir = rootvnode;
  320                 vrefact(rootvnode);
  321         }
  322         if (fdp->fd_rdir == NULL) {
  323                 fdp->fd_rdir = rootvnode;
  324                 vrefact(rootvnode);
  325         }
  326         FILEDESC_XUNLOCK(fdp);
  327 }
  328 
  329 /*
  330  * System calls on descriptors.
  331  */
  332 #ifndef _SYS_SYSPROTO_H_
  333 struct getdtablesize_args {
  334         int     dummy;
  335 };
  336 #endif
  337 /* ARGSUSED */
  338 int
  339 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
  340 {
  341 #ifdef  RACCT
  342         uint64_t lim;
  343 #endif
  344 
  345         td->td_retval[0] =
  346             min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc);
  347 #ifdef  RACCT
  348         PROC_LOCK(td->td_proc);
  349         lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
  350         PROC_UNLOCK(td->td_proc);
  351         if (lim < td->td_retval[0])
  352                 td->td_retval[0] = lim;
  353 #endif
  354         return (0);
  355 }
  356 
  357 /*
  358  * Duplicate a file descriptor to a particular value.
  359  *
  360  * Note: keep in mind that a potential race condition exists when closing
  361  * descriptors from a shared descriptor table (via rfork).
  362  */
  363 #ifndef _SYS_SYSPROTO_H_
  364 struct dup2_args {
  365         u_int   from;
  366         u_int   to;
  367 };
  368 #endif
  369 /* ARGSUSED */
  370 int
  371 sys_dup2(struct thread *td, struct dup2_args *uap)
  372 {
  373 
  374         return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
  375 }
  376 
  377 /*
  378  * Duplicate a file descriptor.
  379  */
  380 #ifndef _SYS_SYSPROTO_H_
  381 struct dup_args {
  382         u_int   fd;
  383 };
  384 #endif
  385 /* ARGSUSED */
  386 int
  387 sys_dup(struct thread *td, struct dup_args *uap)
  388 {
  389 
  390         return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
  391 }
  392 
  /*
   * fcntl system call entry point.  Unpacks the descriptor, command and
   * argument from uap and hands them to kern_fcntl_freebsd(), whose result
   * is returned unchanged.
   */
  #ifndef _SYS_SYSPROTO_H_
  struct fcntl_args {
          int     fd;
          int     cmd;
          long    arg;
  };
  #endif
  /* ARGSUSED */
  int
  sys_fcntl(struct thread *td, struct fcntl_args *uap)
  {
          int error;

          error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
          return (error);
  }
  410 
  411 int
  412 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
  413 {
  414         struct flock fl;
  415         struct __oflock ofl;
  416         intptr_t arg1;
  417         int error, newcmd;
  418 
  419         error = 0;
  420         newcmd = cmd;
  421         switch (cmd) {
  422         case F_OGETLK:
  423         case F_OSETLK:
  424         case F_OSETLKW:
  425                 /*
  426                  * Convert old flock structure to new.
  427                  */
  428                 error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
  429                 fl.l_start = ofl.l_start;
  430                 fl.l_len = ofl.l_len;
  431                 fl.l_pid = ofl.l_pid;
  432                 fl.l_type = ofl.l_type;
  433                 fl.l_whence = ofl.l_whence;
  434                 fl.l_sysid = 0;
  435 
  436                 switch (cmd) {
  437                 case F_OGETLK:
  438                         newcmd = F_GETLK;
  439                         break;
  440                 case F_OSETLK:
  441                         newcmd = F_SETLK;
  442                         break;
  443                 case F_OSETLKW:
  444                         newcmd = F_SETLKW;
  445                         break;
  446                 }
  447                 arg1 = (intptr_t)&fl;
  448                 break;
  449         case F_GETLK:
  450         case F_SETLK:
  451         case F_SETLKW:
  452         case F_SETLK_REMOTE:
  453                 error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
  454                 arg1 = (intptr_t)&fl;
  455                 break;
  456         default:
  457                 arg1 = arg;
  458                 break;
  459         }
  460         if (error)
  461                 return (error);
  462         error = kern_fcntl(td, fd, newcmd, arg1);
  463         if (error)
  464                 return (error);
  465         if (cmd == F_OGETLK) {
  466                 ofl.l_start = fl.l_start;
  467                 ofl.l_len = fl.l_len;
  468                 ofl.l_pid = fl.l_pid;
  469                 ofl.l_type = fl.l_type;
  470                 ofl.l_whence = fl.l_whence;
  471                 error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
  472         } else if (cmd == F_GETLK) {
  473                 error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
  474         }
  475         return (error);
  476 }
  477 
  478 int
  479 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
  480 {
  481         struct filedesc *fdp;
  482         struct flock *flp;
  483         struct file *fp, *fp2;
  484         struct filedescent *fde;
  485         struct proc *p;
  486         struct vnode *vp;
  487         cap_rights_t rights;
  488         int error, flg, tmp;
  489         uint64_t bsize;
  490         off_t foffset;
  491 
  492         error = 0;
  493         flg = F_POSIX;
  494         p = td->td_proc;
  495         fdp = p->p_fd;
  496 
  497         switch (cmd) {
  498         case F_DUPFD:
  499                 tmp = arg;
  500                 error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
  501                 break;
  502 
  503         case F_DUPFD_CLOEXEC:
  504                 tmp = arg;
  505                 error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
  506                 break;
  507 
  508         case F_DUP2FD:
  509                 tmp = arg;
  510                 error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
  511                 break;
  512 
  513         case F_DUP2FD_CLOEXEC:
  514                 tmp = arg;
  515                 error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
  516                 break;
  517 
  518         case F_GETFD:
  519                 error = EBADF;
  520                 FILEDESC_SLOCK(fdp);
  521                 fde = fdeget_locked(fdp, fd);
  522                 if (fde != NULL) {
  523                         td->td_retval[0] =
  524                             (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
  525                         error = 0;
  526                 }
  527                 FILEDESC_SUNLOCK(fdp);
  528                 break;
  529 
  530         case F_SETFD:
  531                 error = EBADF;
  532                 FILEDESC_XLOCK(fdp);
  533                 fde = fdeget_locked(fdp, fd);
  534                 if (fde != NULL) {
  535                         fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
  536                             (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
  537                         error = 0;
  538                 }
  539                 FILEDESC_XUNLOCK(fdp);
  540                 break;
  541 
  542         case F_GETFL:
  543                 error = fget_fcntl(td, fd,
  544                     cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp);
  545                 if (error != 0)
  546                         break;
  547                 td->td_retval[0] = OFLAGS(fp->f_flag);
  548                 fdrop(fp, td);
  549                 break;
  550 
  551         case F_SETFL:
  552                 error = fget_fcntl(td, fd,
  553                     cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp);
  554                 if (error != 0)
  555                         break;
  556                 do {
  557                         tmp = flg = fp->f_flag;
  558                         tmp &= ~FCNTLFLAGS;
  559                         tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
  560                 } while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
  561                 tmp = fp->f_flag & FNONBLOCK;
  562                 error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  563                 if (error != 0) {
  564                         fdrop(fp, td);
  565                         break;
  566                 }
  567                 tmp = fp->f_flag & FASYNC;
  568                 error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
  569                 if (error == 0) {
  570                         fdrop(fp, td);
  571                         break;
  572                 }
  573                 atomic_clear_int(&fp->f_flag, FNONBLOCK);
  574                 tmp = 0;
  575                 (void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
  576                 fdrop(fp, td);
  577                 break;
  578 
  579         case F_GETOWN:
  580                 error = fget_fcntl(td, fd,
  581                     cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp);
  582                 if (error != 0)
  583                         break;
  584                 error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
  585                 if (error == 0)
  586                         td->td_retval[0] = tmp;
  587                 fdrop(fp, td);
  588                 break;
  589 
  590         case F_SETOWN:
  591                 error = fget_fcntl(td, fd,
  592                     cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp);
  593                 if (error != 0)
  594                         break;
  595                 tmp = arg;
  596                 error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
  597                 fdrop(fp, td);
  598                 break;
  599 
  600         case F_SETLK_REMOTE:
  601                 error = priv_check(td, PRIV_NFS_LOCKD);
  602                 if (error)
  603                         return (error);
  604                 flg = F_REMOTE;
  605                 goto do_setlk;
  606 
  607         case F_SETLKW:
  608                 flg |= F_WAIT;
  609                 /* FALLTHROUGH F_SETLK */
  610 
  611         case F_SETLK:
  612         do_setlk:
  613                 cap_rights_init(&rights, CAP_FLOCK);
  614                 error = fget_unlocked(fdp, fd, &rights, &fp, NULL);
  615                 if (error != 0)
  616                         break;
  617                 if (fp->f_type != DTYPE_VNODE) {
  618                         error = EBADF;
  619                         fdrop(fp, td);
  620                         break;
  621                 }
  622 
  623                 flp = (struct flock *)arg;
  624                 if (flp->l_whence == SEEK_CUR) {
  625                         foffset = foffset_get(fp);
  626                         if (foffset < 0 ||
  627                             (flp->l_start > 0 &&
  628                              foffset > OFF_MAX - flp->l_start)) {
  629                                 error = EOVERFLOW;
  630                                 fdrop(fp, td);
  631                                 break;
  632                         }
  633                         flp->l_start += foffset;
  634                 }
  635 
  636                 vp = fp->f_vnode;
  637                 switch (flp->l_type) {
  638                 case F_RDLCK:
  639                         if ((fp->f_flag & FREAD) == 0) {
  640                                 error = EBADF;
  641                                 break;
  642                         }
  643                         PROC_LOCK(p->p_leader);
  644                         p->p_leader->p_flag |= P_ADVLOCK;
  645                         PROC_UNLOCK(p->p_leader);
  646                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
  647                             flp, flg);
  648                         break;
  649                 case F_WRLCK:
  650                         if ((fp->f_flag & FWRITE) == 0) {
  651                                 error = EBADF;
  652                                 break;
  653                         }
  654                         PROC_LOCK(p->p_leader);
  655                         p->p_leader->p_flag |= P_ADVLOCK;
  656                         PROC_UNLOCK(p->p_leader);
  657                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
  658                             flp, flg);
  659                         break;
  660                 case F_UNLCK:
  661                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
  662                             flp, flg);
  663                         break;
  664                 case F_UNLCKSYS:
  665                         /*
  666                          * Temporary api for testing remote lock
  667                          * infrastructure.
  668                          */
  669                         if (flg != F_REMOTE) {
  670                                 error = EINVAL;
  671                                 break;
  672                         }
  673                         error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
  674                             F_UNLCKSYS, flp, flg);
  675                         break;
  676                 default:
  677                         error = EINVAL;
  678                         break;
  679                 }
  680                 if (error != 0 || flp->l_type == F_UNLCK ||
  681                     flp->l_type == F_UNLCKSYS) {
  682                         fdrop(fp, td);
  683                         break;
  684                 }
  685 
  686                 /*
  687                  * Check for a race with close.
  688                  *
  689                  * The vnode is now advisory locked (or unlocked, but this case
  690                  * is not really important) as the caller requested.
  691                  * We had to drop the filedesc lock, so we need to recheck if
  692                  * the descriptor is still valid, because if it was closed
  693                  * in the meantime we need to remove advisory lock from the
  694                  * vnode - close on any descriptor leading to an advisory
  695                  * locked vnode, removes that lock.
  696                  * We will return 0 on purpose in that case, as the result of
  697                  * successful advisory lock might have been externally visible
  698                  * already. This is fine - effectively we pretend to the caller
  699                  * that the closing thread was a bit slower and that the
  700                  * advisory lock succeeded before the close.
  701                  */
  702                 error = fget_unlocked(fdp, fd, &rights, &fp2, NULL);
  703                 if (error != 0) {
  704                         fdrop(fp, td);
  705                         break;
  706                 }
  707                 if (fp != fp2) {
  708                         flp->l_whence = SEEK_SET;
  709                         flp->l_start = 0;
  710                         flp->l_len = 0;
  711                         flp->l_type = F_UNLCK;
  712                         (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
  713                             F_UNLCK, flp, F_POSIX);
  714                 }
  715                 fdrop(fp, td);
  716                 fdrop(fp2, td);
  717                 break;
  718 
  719         case F_GETLK:
  720                 error = fget_unlocked(fdp, fd,
  721                     cap_rights_init(&rights, CAP_FLOCK), &fp, NULL);
  722                 if (error != 0)
  723                         break;
  724                 if (fp->f_type != DTYPE_VNODE) {
  725                         error = EBADF;
  726                         fdrop(fp, td);
  727                         break;
  728                 }
  729                 flp = (struct flock *)arg;
  730                 if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
  731                     flp->l_type != F_UNLCK) {
  732                         error = EINVAL;
  733                         fdrop(fp, td);
  734                         break;
  735                 }
  736                 if (flp->l_whence == SEEK_CUR) {
  737                         foffset = foffset_get(fp);
  738                         if ((flp->l_start > 0 &&
  739                             foffset > OFF_MAX - flp->l_start) ||
  740                             (flp->l_start < 0 &&
  741                             foffset < OFF_MIN - flp->l_start)) {
  742                                 error = EOVERFLOW;
  743                                 fdrop(fp, td);
  744                                 break;
  745                         }
  746                         flp->l_start += foffset;
  747                 }
  748                 vp = fp->f_vnode;
  749                 error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
  750                     F_POSIX);
  751                 fdrop(fp, td);
  752                 break;
  753 
  754         case F_RDAHEAD:
  755                 arg = arg ? 128 * 1024: 0;
  756                 /* FALLTHROUGH */
  757         case F_READAHEAD:
  758                 error = fget_unlocked(fdp, fd,
  759                     cap_rights_init(&rights), &fp, NULL);
  760                 if (error != 0)
  761                         break;
  762                 if (fp->f_type != DTYPE_VNODE) {
  763                         fdrop(fp, td);
  764                         error = EBADF;
  765                         break;
  766                 }
  767                 vp = fp->f_vnode;
  768                 if (vp->v_type != VREG) {
  769                         fdrop(fp, td);
  770                         error = ENOTTY;
  771                         break;
  772                 }
  773 
  774                 /*
  775                  * Exclusive lock synchronizes against f_seqcount reads and
  776                  * writes in sequential_heuristic().
  777                  */
  778                 error = vn_lock(vp, LK_EXCLUSIVE);
  779                 if (error != 0) {
  780                         fdrop(fp, td);
  781                         break;
  782                 }
  783                 if (arg >= 0) {
  784                         bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
  785                         fp->f_seqcount = (arg + bsize - 1) / bsize;
  786                         atomic_set_int(&fp->f_flag, FRDAHEAD);
  787                 } else {
  788                         atomic_clear_int(&fp->f_flag, FRDAHEAD);
  789                 }
  790                 VOP_UNLOCK(vp, 0);
  791                 fdrop(fp, td);
  792                 break;
  793 
  794         default:
  795                 error = EINVAL;
  796                 break;
  797         }
  798         return (error);
  799 }
  800 
  801 static int
  802 getmaxfd(struct thread *td)
  803 {
  804 
  805         return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
  806 }
  807 
  808 /*
  809  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  810  */
  811 int
  812 kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
  813 {
  814         struct filedesc *fdp;
  815         struct filedescent *oldfde, *newfde;
  816         struct proc *p;
  817         struct file *delfp;
  818         int error, maxfd;
  819 
  820         p = td->td_proc;
  821         fdp = p->p_fd;
  822 
  823         MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
  824         MPASS(mode < FDDUP_LASTMODE);
  825 
  826         AUDIT_ARG_FD(old);
  827         /* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
  828 
  829         /*
  830          * Verify we have a valid descriptor to dup from and possibly to
  831          * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
  832          * return EINVAL when the new descriptor is out of bounds.
  833          */
  834         if (old < 0)
  835                 return (EBADF);
  836         if (new < 0)
  837                 return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
  838         maxfd = getmaxfd(td);
  839         if (new >= maxfd)
  840                 return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
  841 
  842         error = EBADF;
  843         FILEDESC_XLOCK(fdp);
  844         if (fget_locked(fdp, old) == NULL)
  845                 goto unlock;
  846         if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
  847                 td->td_retval[0] = new;
  848                 if (flags & FDDUP_FLAG_CLOEXEC)
  849                         fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
  850                 error = 0;
  851                 goto unlock;
  852         }
  853 
  854         /*
  855          * If the caller specified a file descriptor, make sure the file
  856          * table is large enough to hold it, and grab it.  Otherwise, just
  857          * allocate a new descriptor the usual way.
  858          */
  859         switch (mode) {
  860         case FDDUP_NORMAL:
  861         case FDDUP_FCNTL:
  862                 if ((error = fdalloc(td, new, &new)) != 0)
  863                         goto unlock;
  864                 break;
  865         case FDDUP_MUSTREPLACE:
  866                 /* Target file descriptor must exist. */
  867                 if (fget_locked(fdp, new) == NULL)
  868                         goto unlock;
  869                 break;
  870         case FDDUP_FIXED:
  871                 if (new >= fdp->fd_nfiles) {
  872                         /*
  873                          * The resource limits are here instead of e.g.
  874                          * fdalloc(), because the file descriptor table may be
  875                          * shared between processes, so we can't really use
  876                          * racct_add()/racct_sub().  Instead of counting the
  877                          * number of actually allocated descriptors, just put
  878                          * the limit on the size of the file descriptor table.
  879                          */
  880 #ifdef RACCT
  881                         if (racct_enable) {
  882                                 PROC_LOCK(p);
  883                                 error = racct_set(p, RACCT_NOFILE, new + 1);
  884                                 PROC_UNLOCK(p);
  885                                 if (error != 0) {
  886                                         error = EMFILE;
  887                                         goto unlock;
  888                                 }
  889                         }
  890 #endif
  891                         fdgrowtable_exp(fdp, new + 1);
  892                 }
  893                 if (!fdisused(fdp, new))
  894                         fdused(fdp, new);
  895                 break;
  896         default:
  897                 KASSERT(0, ("%s unsupported mode %d", __func__, mode));
  898         }
  899 
  900         KASSERT(old != new, ("new fd is same as old"));
  901 
  902         oldfde = &fdp->fd_ofiles[old];
  903         fhold(oldfde->fde_file);
  904         newfde = &fdp->fd_ofiles[new];
  905         delfp = newfde->fde_file;
  906 
  907         /*
  908          * Duplicate the source descriptor.
  909          */
  910 #ifdef CAPABILITIES
  911         seq_write_begin(&newfde->fde_seq);
  912 #endif
  913         filecaps_free(&newfde->fde_caps);
  914         memcpy(newfde, oldfde, fde_change_size);
  915         filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
  916         if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
  917                 newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
  918         else
  919                 newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
  920 #ifdef CAPABILITIES
  921         seq_write_end(&newfde->fde_seq);
  922 #endif
  923         td->td_retval[0] = new;
  924 
  925         error = 0;
  926 
  927         if (delfp != NULL) {
  928                 (void) closefp(fdp, new, delfp, td, 1);
  929                 FILEDESC_UNLOCK_ASSERT(fdp);
  930         } else {
  931 unlock:
  932                 FILEDESC_XUNLOCK(fdp);
  933         }
  934 
  935         return (error);
  936 }
  937 
  938 /*
  939  * If sigio is on the list associated with a process or process group,
  940  * disable signalling from the device, remove sigio from the list and
  941  * free sigio.
  942  */
  943 void
  944 funsetown(struct sigio **sigiop)
  945 {
  946         struct sigio *sigio;
  947 
  948         if (*sigiop == NULL)
  949                 return;
  950         SIGIO_LOCK();
  951         sigio = *sigiop;
  952         if (sigio == NULL) {
  953                 SIGIO_UNLOCK();
  954                 return;
  955         }
  956         *(sigio->sio_myref) = NULL;
  957         if ((sigio)->sio_pgid < 0) {
  958                 struct pgrp *pg = (sigio)->sio_pgrp;
  959                 PGRP_LOCK(pg);
  960                 SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
  961                             sigio, sio_pgsigio);
  962                 PGRP_UNLOCK(pg);
  963         } else {
  964                 struct proc *p = (sigio)->sio_proc;
  965                 PROC_LOCK(p);
  966                 SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
  967                             sigio, sio_pgsigio);
  968                 PROC_UNLOCK(p);
  969         }
  970         SIGIO_UNLOCK();
  971         crfree(sigio->sio_ucred);
  972         free(sigio, M_SIGIO);
  973 }
  974 
  975 /*
  976  * Free a list of sigio structures.
  977  * We only need to lock the SIGIO_LOCK because we have made ourselves
  978  * inaccessible to callers of fsetown and therefore do not need to lock
  979  * the proc or pgrp struct for the list manipulation.
  980  */
  981 void
  982 funsetownlst(struct sigiolst *sigiolst)
  983 {
  984         struct proc *p;
  985         struct pgrp *pg;
  986         struct sigio *sigio;
  987 
  988         sigio = SLIST_FIRST(sigiolst);
  989         if (sigio == NULL)
  990                 return;
  991         p = NULL;
  992         pg = NULL;
  993 
  994         /*
  995          * Every entry of the list should belong
  996          * to a single proc or pgrp.
  997          */
  998         if (sigio->sio_pgid < 0) {
  999                 pg = sigio->sio_pgrp;
 1000                 PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 1001         } else /* if (sigio->sio_pgid > 0) */ {
 1002                 p = sigio->sio_proc;
 1003                 PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 1004         }
 1005 
 1006         SIGIO_LOCK();
 1007         while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 1008                 *(sigio->sio_myref) = NULL;
 1009                 if (pg != NULL) {
 1010                         KASSERT(sigio->sio_pgid < 0,
 1011                             ("Proc sigio in pgrp sigio list"));
 1012                         KASSERT(sigio->sio_pgrp == pg,
 1013                             ("Bogus pgrp in sigio list"));
 1014                         PGRP_LOCK(pg);
 1015                         SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 1016                             sio_pgsigio);
 1017                         PGRP_UNLOCK(pg);
 1018                 } else /* if (p != NULL) */ {
 1019                         KASSERT(sigio->sio_pgid > 0,
 1020                             ("Pgrp sigio in proc sigio list"));
 1021                         KASSERT(sigio->sio_proc == p,
 1022                             ("Bogus proc in sigio list"));
 1023                         PROC_LOCK(p);
 1024                         SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 1025                             sio_pgsigio);
 1026                         PROC_UNLOCK(p);
 1027                 }
 1028                 SIGIO_UNLOCK();
 1029                 crfree(sigio->sio_ucred);
 1030                 free(sigio, M_SIGIO);
 1031                 SIGIO_LOCK();
 1032         }
 1033         SIGIO_UNLOCK();
 1034 }
 1035 
 1036 /*
 1037  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
 1038  *
 1039  * After permission checking, add a sigio structure to the sigio list for
 1040  * the process or process group.
 1041  */
 1042 int
 1043 fsetown(pid_t pgid, struct sigio **sigiop)
 1044 {
 1045         struct proc *proc;
 1046         struct pgrp *pgrp;
 1047         struct sigio *sigio;
 1048         int ret;
 1049 
 1050         if (pgid == 0) {
 1051                 funsetown(sigiop);
 1052                 return (0);
 1053         }
 1054 
 1055         ret = 0;
 1056 
 1057         /* Allocate and fill in the new sigio out of locks. */
 1058         sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 1059         sigio->sio_pgid = pgid;
 1060         sigio->sio_ucred = crhold(curthread->td_ucred);
 1061         sigio->sio_myref = sigiop;
 1062 
 1063         sx_slock(&proctree_lock);
 1064         if (pgid > 0) {
 1065                 proc = pfind(pgid);
 1066                 if (proc == NULL) {
 1067                         ret = ESRCH;
 1068                         goto fail;
 1069                 }
 1070 
 1071                 /*
 1072                  * Policy - Don't allow a process to FSETOWN a process
 1073                  * in another session.
 1074                  *
 1075                  * Remove this test to allow maximum flexibility or
 1076                  * restrict FSETOWN to the current process or process
 1077                  * group for maximum safety.
 1078                  */
 1079                 PROC_UNLOCK(proc);
 1080                 if (proc->p_session != curthread->td_proc->p_session) {
 1081                         ret = EPERM;
 1082                         goto fail;
 1083                 }
 1084 
 1085                 pgrp = NULL;
 1086         } else /* if (pgid < 0) */ {
 1087                 pgrp = pgfind(-pgid);
 1088                 if (pgrp == NULL) {
 1089                         ret = ESRCH;
 1090                         goto fail;
 1091                 }
 1092                 PGRP_UNLOCK(pgrp);
 1093 
 1094                 /*
 1095                  * Policy - Don't allow a process to FSETOWN a process
 1096                  * in another session.
 1097                  *
 1098                  * Remove this test to allow maximum flexibility or
 1099                  * restrict FSETOWN to the current process or process
 1100                  * group for maximum safety.
 1101                  */
 1102                 if (pgrp->pg_session != curthread->td_proc->p_session) {
 1103                         ret = EPERM;
 1104                         goto fail;
 1105                 }
 1106 
 1107                 proc = NULL;
 1108         }
 1109         funsetown(sigiop);
 1110         if (pgid > 0) {
 1111                 PROC_LOCK(proc);
 1112                 /*
 1113                  * Since funsetownlst() is called without the proctree
 1114                  * locked, we need to check for P_WEXIT.
 1115                  * XXX: is ESRCH correct?
 1116                  */
 1117                 if ((proc->p_flag & P_WEXIT) != 0) {
 1118                         PROC_UNLOCK(proc);
 1119                         ret = ESRCH;
 1120                         goto fail;
 1121                 }
 1122                 SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 1123                 sigio->sio_proc = proc;
 1124                 PROC_UNLOCK(proc);
 1125         } else {
 1126                 PGRP_LOCK(pgrp);
 1127                 SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 1128                 sigio->sio_pgrp = pgrp;
 1129                 PGRP_UNLOCK(pgrp);
 1130         }
 1131         sx_sunlock(&proctree_lock);
 1132         SIGIO_LOCK();
 1133         *sigiop = sigio;
 1134         SIGIO_UNLOCK();
 1135         return (0);
 1136 
 1137 fail:
 1138         sx_sunlock(&proctree_lock);
 1139         crfree(sigio->sio_ucred);
 1140         free(sigio, M_SIGIO);
 1141         return (ret);
 1142 }
 1143 
 1144 /*
 1145  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
 1146  */
 1147 pid_t
 1148 fgetown(sigiop)
 1149         struct sigio **sigiop;
 1150 {
 1151         pid_t pgid;
 1152 
 1153         SIGIO_LOCK();
 1154         pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 1155         SIGIO_UNLOCK();
 1156         return (pgid);
 1157 }
 1158 
 1159 /*
 1160  * Function drops the filedesc lock on return.
 1161  */
 1162 static int
 1163 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
 1164     int holdleaders)
 1165 {
 1166         int error;
 1167 
 1168         FILEDESC_XLOCK_ASSERT(fdp);
 1169 
 1170         if (holdleaders) {
 1171                 if (td->td_proc->p_fdtol != NULL) {
 1172                         /*
 1173                          * Ask fdfree() to sleep to ensure that all relevant
 1174                          * process leaders can be traversed in closef().
 1175                          */
 1176                         fdp->fd_holdleaderscount++;
 1177                 } else {
 1178                         holdleaders = 0;
 1179                 }
 1180         }
 1181 
 1182         /*
 1183          * We now hold the fp reference that used to be owned by the
 1184          * descriptor array.  We have to unlock the FILEDESC *AFTER*
 1185          * knote_fdclose to prevent a race of the fd getting opened, a knote
 1186          * added, and deleteing a knote for the new fd.
 1187          */
 1188         knote_fdclose(td, fd);
 1189 
 1190         /*
 1191          * We need to notify mqueue if the object is of type mqueue.
 1192          */
 1193         if (fp->f_type == DTYPE_MQUEUE)
 1194                 mq_fdclose(td, fd, fp);
 1195         FILEDESC_XUNLOCK(fdp);
 1196 
 1197         error = closef(fp, td);
 1198         if (holdleaders) {
 1199                 FILEDESC_XLOCK(fdp);
 1200                 fdp->fd_holdleaderscount--;
 1201                 if (fdp->fd_holdleaderscount == 0 &&
 1202                     fdp->fd_holdleaderswakeup != 0) {
 1203                         fdp->fd_holdleaderswakeup = 0;
 1204                         wakeup(&fdp->fd_holdleaderscount);
 1205                 }
 1206                 FILEDESC_XUNLOCK(fdp);
 1207         }
 1208         return (error);
 1209 }
 1210 
 1211 /*
 1212  * Close a file descriptor.
 1213  */
 1214 #ifndef _SYS_SYSPROTO_H_
 1215 struct close_args {
 1216         int     fd;
 1217 };
 1218 #endif
 1219 /* ARGSUSED */
 1220 int
 1221 sys_close(struct thread *td, struct close_args *uap)
 1222 {
 1223 
 1224         return (kern_close(td, uap->fd));
 1225 }
 1226 
 1227 int
 1228 kern_close(struct thread *td, int fd)
 1229 {
 1230         struct filedesc *fdp;
 1231         struct file *fp;
 1232 
 1233         fdp = td->td_proc->p_fd;
 1234 
 1235         AUDIT_SYSCLOSE(td, fd);
 1236 
 1237         FILEDESC_XLOCK(fdp);
 1238         if ((fp = fget_locked(fdp, fd)) == NULL) {
 1239                 FILEDESC_XUNLOCK(fdp);
 1240                 return (EBADF);
 1241         }
 1242         fdfree(fdp, fd);
 1243 
 1244         /* closefp() drops the FILEDESC lock for us. */
 1245         return (closefp(fdp, fd, fp, td, 1));
 1246 }
 1247 
 1248 /*
 1249  * Close open file descriptors.
 1250  */
 1251 #ifndef _SYS_SYSPROTO_H_
 1252 struct closefrom_args {
 1253         int     lowfd;
 1254 };
 1255 #endif
 1256 /* ARGSUSED */
 1257 int
 1258 sys_closefrom(struct thread *td, struct closefrom_args *uap)
 1259 {
 1260         struct filedesc *fdp;
 1261         int fd;
 1262 
 1263         fdp = td->td_proc->p_fd;
 1264         AUDIT_ARG_FD(uap->lowfd);
 1265 
 1266         /*
 1267          * Treat negative starting file descriptor values identical to
 1268          * closefrom(0) which closes all files.
 1269          */
 1270         if (uap->lowfd < 0)
 1271                 uap->lowfd = 0;
 1272         FILEDESC_SLOCK(fdp);
 1273         for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
 1274                 if (fdp->fd_ofiles[fd].fde_file != NULL) {
 1275                         FILEDESC_SUNLOCK(fdp);
 1276                         (void)kern_close(td, fd);
 1277                         FILEDESC_SLOCK(fdp);
 1278                 }
 1279         }
 1280         FILEDESC_SUNLOCK(fdp);
 1281         return (0);
 1282 }
 1283 
 1284 #if defined(COMPAT_43)
 1285 /*
 1286  * Return status information about a file descriptor.
 1287  */
 1288 #ifndef _SYS_SYSPROTO_H_
 1289 struct ofstat_args {
 1290         int     fd;
 1291         struct  ostat *sb;
 1292 };
 1293 #endif
 1294 /* ARGSUSED */
 1295 int
 1296 ofstat(struct thread *td, struct ofstat_args *uap)
 1297 {
 1298         struct ostat oub;
 1299         struct stat ub;
 1300         int error;
 1301 
 1302         error = kern_fstat(td, uap->fd, &ub);
 1303         if (error == 0) {
 1304                 cvtstat(&ub, &oub);
 1305                 error = copyout(&oub, uap->sb, sizeof(oub));
 1306         }
 1307         return (error);
 1308 }
 1309 #endif /* COMPAT_43 */
 1310 
 1311 /*
 1312  * Return status information about a file descriptor.
 1313  */
 1314 #ifndef _SYS_SYSPROTO_H_
 1315 struct fstat_args {
 1316         int     fd;
 1317         struct  stat *sb;
 1318 };
 1319 #endif
 1320 /* ARGSUSED */
 1321 int
 1322 sys_fstat(struct thread *td, struct fstat_args *uap)
 1323 {
 1324         struct stat ub;
 1325         int error;
 1326 
 1327         error = kern_fstat(td, uap->fd, &ub);
 1328         if (error == 0)
 1329                 error = copyout(&ub, uap->sb, sizeof(ub));
 1330         return (error);
 1331 }
 1332 
 1333 int
 1334 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 1335 {
 1336         struct file *fp;
 1337         cap_rights_t rights;
 1338         int error;
 1339 
 1340         AUDIT_ARG_FD(fd);
 1341 
 1342         error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
 1343         if (error != 0)
 1344                 return (error);
 1345 
 1346         AUDIT_ARG_FILE(td->td_proc, fp);
 1347 
 1348         error = fo_stat(fp, sbp, td->td_ucred, td);
 1349         fdrop(fp, td);
 1350 #ifdef KTRACE
 1351         if (error == 0 && KTRPOINT(td, KTR_STRUCT))
 1352                 ktrstat(sbp);
 1353 #endif
 1354         return (error);
 1355 }
 1356 
 1357 /*
 1358  * Return status information about a file descriptor.
 1359  */
 1360 #ifndef _SYS_SYSPROTO_H_
 1361 struct nfstat_args {
 1362         int     fd;
 1363         struct  nstat *sb;
 1364 };
 1365 #endif
 1366 /* ARGSUSED */
 1367 int
 1368 sys_nfstat(struct thread *td, struct nfstat_args *uap)
 1369 {
 1370         struct nstat nub;
 1371         struct stat ub;
 1372         int error;
 1373 
 1374         error = kern_fstat(td, uap->fd, &ub);
 1375         if (error == 0) {
 1376                 cvtnstat(&ub, &nub);
 1377                 error = copyout(&nub, uap->sb, sizeof(nub));
 1378         }
 1379         return (error);
 1380 }
 1381 
 1382 /*
 1383  * Return pathconf information about a file descriptor.
 1384  */
 1385 #ifndef _SYS_SYSPROTO_H_
 1386 struct fpathconf_args {
 1387         int     fd;
 1388         int     name;
 1389 };
 1390 #endif
 1391 /* ARGSUSED */
 1392 int
 1393 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 1394 {
 1395 
 1396         return (kern_fpathconf(td, uap->fd, uap->name));
 1397 }
 1398 
 1399 int
 1400 kern_fpathconf(struct thread *td, int fd, int name)
 1401 {
 1402         struct file *fp;
 1403         struct vnode *vp;
 1404         cap_rights_t rights;
 1405         int error;
 1406 
 1407         error = fget(td, fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
 1408         if (error != 0)
 1409                 return (error);
 1410 
 1411         if (name == _PC_ASYNC_IO) {
 1412                 td->td_retval[0] = _POSIX_ASYNCHRONOUS_IO;
 1413                 goto out;
 1414         }
 1415         vp = fp->f_vnode;
 1416         if (vp != NULL) {
 1417                 vn_lock(vp, LK_SHARED | LK_RETRY);
 1418                 error = VOP_PATHCONF(vp, name, td->td_retval);
 1419                 VOP_UNLOCK(vp, 0);
 1420         } else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 1421                 if (name != _PC_PIPE_BUF) {
 1422                         error = EINVAL;
 1423                 } else {
 1424                         td->td_retval[0] = PIPE_BUF;
 1425                         error = 0;
 1426                 }
 1427         } else {
 1428                 error = EOPNOTSUPP;
 1429         }
 1430 out:
 1431         fdrop(fp, td);
 1432         return (error);
 1433 }
 1434 
 1435 /*
 1436  * Initialize filecaps structure.
 1437  */
 1438 void
 1439 filecaps_init(struct filecaps *fcaps)
 1440 {
 1441 
 1442         bzero(fcaps, sizeof(*fcaps));
 1443         fcaps->fc_nioctls = -1;
 1444 }
 1445 
 1446 /*
 1447  * Copy filecaps structure allocating memory for ioctls array if needed.
 1448  *
 1449  * The last parameter indicates whether the fdtable is locked. If it is not and
 1450  * ioctls are encountered, copying fails and the caller must lock the table.
 1451  *
 1452  * Note that if the table was not locked, the caller has to check the relevant
 1453  * sequence counter to determine whether the operation was successful.
 1454  */
 1455 bool
 1456 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
 1457 {
 1458         size_t size;
 1459 
 1460         if (src->fc_ioctls != NULL && !locked)
 1461                 return (false);
 1462         *dst = *src;
 1463         if (src->fc_ioctls == NULL)
 1464                 return (true);
 1465 
 1466         KASSERT(src->fc_nioctls > 0,
 1467             ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 1468 
 1469         size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 1470         dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 1471         bcopy(src->fc_ioctls, dst->fc_ioctls, size);
 1472         return (true);
 1473 }
 1474 
 1475 /*
 1476  * Move filecaps structure to the new place and clear the old place.
 1477  */
 1478 void
 1479 filecaps_move(struct filecaps *src, struct filecaps *dst)
 1480 {
 1481 
 1482         *dst = *src;
 1483         bzero(src, sizeof(*src));
 1484 }
 1485 
 1486 /*
 1487  * Fill the given filecaps structure with full rights.
 1488  */
 1489 static void
 1490 filecaps_fill(struct filecaps *fcaps)
 1491 {
 1492 
 1493         CAP_ALL(&fcaps->fc_rights);
 1494         fcaps->fc_ioctls = NULL;
 1495         fcaps->fc_nioctls = -1;
 1496         fcaps->fc_fcntls = CAP_FCNTL_ALL;
 1497 }
 1498 
 1499 /*
 1500  * Free memory allocated within filecaps structure.
 1501  */
 1502 void
 1503 filecaps_free(struct filecaps *fcaps)
 1504 {
 1505 
 1506         free(fcaps->fc_ioctls, M_FILECAPS);
 1507         bzero(fcaps, sizeof(*fcaps));
 1508 }
 1509 
 1510 /*
 1511  * Validate the given filecaps structure.
 1512  */
 1513 static void
 1514 filecaps_validate(const struct filecaps *fcaps, const char *func)
 1515 {
 1516 
 1517         KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 1518             ("%s: invalid rights", func));
 1519         KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 1520             ("%s: invalid fcntls", func));
 1521         KASSERT(fcaps->fc_fcntls == 0 ||
 1522             cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 1523             ("%s: fcntls without CAP_FCNTL", func));
 1524         KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
 1525             (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
 1526             ("%s: invalid ioctls", func));
 1527         KASSERT(fcaps->fc_nioctls == 0 ||
 1528             cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 1529             ("%s: ioctls without CAP_IOCTL", func));
 1530 }
 1531 
 1532 static void
 1533 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 1534 {
 1535         int nfd1;
 1536 
 1537         FILEDESC_XLOCK_ASSERT(fdp);
 1538 
 1539         nfd1 = fdp->fd_nfiles * 2;
 1540         if (nfd1 < nfd)
 1541                 nfd1 = nfd;
 1542         fdgrowtable(fdp, nfd1);
 1543 }
 1544 
 1545 /*
 1546  * Grow the file table to accommodate (at least) nfd descriptors.
 1547  */
 1548 static void
 1549 fdgrowtable(struct filedesc *fdp, int nfd)
 1550 {
 1551         struct filedesc0 *fdp0;
 1552         struct freetable *ft;
 1553         struct fdescenttbl *ntable;
 1554         struct fdescenttbl *otable;
 1555         int nnfiles, onfiles;
 1556         NDSLOTTYPE *nmap, *omap;
 1557 
 1558         /*
 1559          * If lastfile is -1 this struct filedesc was just allocated and we are
 1560          * growing it to accommodate for the one we are going to copy from. There
 1561          * is no need to have a lock on this one as it's not visible to anyone.
 1562          */
 1563         if (fdp->fd_lastfile != -1)
 1564                 FILEDESC_XLOCK_ASSERT(fdp);
 1565 
 1566         KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 1567 
 1568         /* save old values */
 1569         onfiles = fdp->fd_nfiles;
 1570         otable = fdp->fd_files;
 1571         omap = fdp->fd_map;
 1572 
 1573         /* compute the size of the new table */
 1574         nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 1575         if (nnfiles <= onfiles)
 1576                 /* the table is already large enough */
 1577                 return;
 1578 
 1579         /*
 1580          * Allocate a new table.  We need enough space for the number of
 1581          * entries, file entries themselves and the struct freetable we will use
 1582          * when we decommission the table and place it on the freelist.
 1583          * We place the struct freetable in the middle so we don't have
 1584          * to worry about padding.
 1585          */
 1586         ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 1587             nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 1588             sizeof(struct freetable),
 1589             M_FILEDESC, M_ZERO | M_WAITOK);
 1590         /* copy the old data */
 1591         ntable->fdt_nfiles = nnfiles;
 1592         memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 1593             onfiles * sizeof(ntable->fdt_ofiles[0]));
 1594 
 1595         /*
 1596          * Allocate a new map only if the old is not large enough.  It will
 1597          * grow at a slower rate than the table as it can map more
 1598          * entries than the table can hold.
 1599          */
 1600         if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 1601                 nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 1602                     M_ZERO | M_WAITOK);
 1603                 /* copy over the old data and update the pointer */
 1604                 memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 1605                 fdp->fd_map = nmap;
 1606         }
 1607 
 1608         /*
 1609          * Make sure that ntable is correctly initialized before we replace
 1610          * fd_files poiner. Otherwise fget_unlocked() may see inconsistent
 1611          * data.
 1612          */
 1613         atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 1614 
 1615         /*
 1616          * Do not free the old file table, as some threads may still
 1617          * reference entries within it.  Instead, place it on a freelist
 1618          * which will be processed when the struct filedesc is released.
 1619          *
 1620          * Note that if onfiles == NDFILE, we're dealing with the original
 1621          * static allocation contained within (struct filedesc0 *)fdp,
 1622          * which must not be freed.
 1623          */
 1624         if (onfiles > NDFILE) {
 1625                 ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 1626                 fdp0 = (struct filedesc0 *)fdp;
 1627                 ft->ft_table = otable;
 1628                 SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 1629         }
 1630         /*
 1631          * The map does not have the same possibility of threads still
 1632          * holding references to it.  So always free it as long as it
 1633          * does not reference the original static allocation.
 1634          */
 1635         if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 1636                 free(omap, M_FILEDESC);
 1637 }
 1638 
 1639 /*
 1640  * Allocate a file descriptor for the process.
 1641  */
 1642 int
 1643 fdalloc(struct thread *td, int minfd, int *result)
 1644 {
 1645         struct proc *p = td->td_proc;
 1646         struct filedesc *fdp = p->p_fd;
 1647         int fd, maxfd, allocfd;
 1648 #ifdef RACCT
 1649         int error;
 1650 #endif
 1651 
 1652         FILEDESC_XLOCK_ASSERT(fdp);
 1653 
 1654         if (fdp->fd_freefile > minfd)
 1655                 minfd = fdp->fd_freefile;
 1656 
 1657         maxfd = getmaxfd(td);
 1658 
 1659         /*
 1660          * Search the bitmap for a free descriptor starting at minfd.
 1661          * If none is found, grow the file table.
 1662          */
 1663         fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 1664         if (fd >= maxfd)
 1665                 return (EMFILE);
 1666         if (fd >= fdp->fd_nfiles) {
 1667                 allocfd = min(fd * 2, maxfd);
 1668 #ifdef RACCT
 1669                 if (racct_enable) {
 1670                         PROC_LOCK(p);
 1671                         error = racct_set(p, RACCT_NOFILE, allocfd);
 1672                         PROC_UNLOCK(p);
 1673                         if (error != 0)
 1674                                 return (EMFILE);
 1675                 }
 1676 #endif
 1677                 /*
 1678                  * fd is already equal to first free descriptor >= minfd, so
 1679                  * we only need to grow the table and we are done.
 1680                  */
 1681                 fdgrowtable_exp(fdp, allocfd);
 1682         }
 1683 
 1684         /*
 1685          * Perform some sanity checks, then mark the file descriptor as
 1686          * used and return it to the caller.
 1687          */
 1688         KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 1689             ("invalid descriptor %d", fd));
 1690         KASSERT(!fdisused(fdp, fd),
 1691             ("fd_first_free() returned non-free descriptor"));
 1692         KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 1693             ("file descriptor isn't free"));
 1694         fdused(fdp, fd);
 1695         *result = fd;
 1696         return (0);
 1697 }
 1698 
 1699 /*
 1700  * Allocate n file descriptors for the process.
 1701  */
 1702 int
 1703 fdallocn(struct thread *td, int minfd, int *fds, int n)
 1704 {
 1705         struct proc *p = td->td_proc;
 1706         struct filedesc *fdp = p->p_fd;
 1707         int i;
 1708 
 1709         FILEDESC_XLOCK_ASSERT(fdp);
 1710 
 1711         for (i = 0; i < n; i++)
 1712                 if (fdalloc(td, 0, &fds[i]) != 0)
 1713                         break;
 1714 
 1715         if (i < n) {
 1716                 for (i--; i >= 0; i--)
 1717                         fdunused(fdp, fds[i]);
 1718                 return (EMFILE);
 1719         }
 1720 
 1721         return (0);
 1722 }
 1723 
 1724 /*
 1725  * Create a new open file structure and allocate a file descriptor for the
 1726  * process that refers to it.  We add one reference to the file for the
 1727  * descriptor table and one reference for resultfp. This is to prevent us
 1728  * being preempted and the entry in the descriptor table closed after we
 1729  * release the FILEDESC lock.
 1730  */
 1731 int
 1732 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
 1733     struct filecaps *fcaps)
 1734 {
 1735         struct file *fp;
 1736         int error, fd;
 1737 
 1738         error = falloc_noinstall(td, &fp);
 1739         if (error)
 1740                 return (error);         /* no reference held on error */
 1741 
 1742         error = finstall(td, fp, &fd, flags, fcaps);
 1743         if (error) {
 1744                 fdrop(fp, td);          /* one reference (fp only) */
 1745                 return (error);
 1746         }
 1747 
 1748         if (resultfp != NULL)
 1749                 *resultfp = fp;         /* copy out result */
 1750         else
 1751                 fdrop(fp, td);          /* release local reference */
 1752 
 1753         if (resultfd != NULL)
 1754                 *resultfd = fd;
 1755 
 1756         return (0);
 1757 }
 1758 
 1759 /*
 1760  * Create a new open file structure without allocating a file descriptor.
 1761  */
 1762 int
 1763 falloc_noinstall(struct thread *td, struct file **resultfp)
 1764 {
 1765         struct file *fp;
 1766         int maxuserfiles = maxfiles - (maxfiles / 20);
 1767         int openfiles_new;
 1768         static struct timeval lastfail;
 1769         static int curfail;
 1770 
 1771         KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 1772 
 1773         openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
 1774         if ((openfiles_new >= maxuserfiles &&
 1775             priv_check(td, PRIV_MAXFILES) != 0) ||
 1776             openfiles_new >= maxfiles) {
 1777                 atomic_subtract_int(&openfiles, 1);
 1778                 if (ppsratecheck(&lastfail, &curfail, 1)) {
 1779                         printf("kern.maxfiles limit exceeded by uid %i, (%s) "
 1780                             "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
 1781                 }
 1782                 return (ENFILE);
 1783         }
 1784         fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 1785         refcount_init(&fp->f_count, 1);
 1786         fp->f_cred = crhold(td->td_ucred);
 1787         fp->f_ops = &badfileops;
 1788         *resultfp = fp;
 1789         return (0);
 1790 }
 1791 
 1792 /*
 1793  * Install a file in a file descriptor table.
 1794  */
 1795 void
 1796 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
 1797     struct filecaps *fcaps)
 1798 {
 1799         struct filedescent *fde;
 1800 
 1801         MPASS(fp != NULL);
 1802         if (fcaps != NULL)
 1803                 filecaps_validate(fcaps, __func__);
 1804         FILEDESC_XLOCK_ASSERT(fdp);
 1805 
 1806         fde = &fdp->fd_ofiles[fd];
 1807 #ifdef CAPABILITIES
 1808         seq_write_begin(&fde->fde_seq);
 1809 #endif
 1810         fde->fde_file = fp;
 1811         fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
 1812         if (fcaps != NULL)
 1813                 filecaps_move(fcaps, &fde->fde_caps);
 1814         else
 1815                 filecaps_fill(&fde->fde_caps);
 1816 #ifdef CAPABILITIES
 1817         seq_write_end(&fde->fde_seq);
 1818 #endif
 1819 }
 1820 
 1821 int
 1822 finstall(struct thread *td, struct file *fp, int *fd, int flags,
 1823     struct filecaps *fcaps)
 1824 {
 1825         struct filedesc *fdp = td->td_proc->p_fd;
 1826         int error;
 1827 
 1828         MPASS(fd != NULL);
 1829 
 1830         FILEDESC_XLOCK(fdp);
 1831         if ((error = fdalloc(td, 0, fd))) {
 1832                 FILEDESC_XUNLOCK(fdp);
 1833                 return (error);
 1834         }
 1835         fhold(fp);
 1836         _finstall(fdp, fp, *fd, flags, fcaps);
 1837         FILEDESC_XUNLOCK(fdp);
 1838         return (0);
 1839 }
 1840 
 1841 /*
 1842  * Build a new filedesc structure from another.
 1843  * Copy the current, root, and jail root vnode references.
 1844  *
 1845  * If fdp is not NULL, return with it shared locked.
 1846  */
 1847 struct filedesc *
 1848 fdinit(struct filedesc *fdp, bool prepfiles)
 1849 {
 1850         struct filedesc0 *newfdp0;
 1851         struct filedesc *newfdp;
 1852 
 1853         newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 1854         newfdp = &newfdp0->fd_fd;
 1855 
 1856         /* Create the file descriptor table. */
 1857         FILEDESC_LOCK_INIT(newfdp);
 1858         refcount_init(&newfdp->fd_refcnt, 1);
 1859         refcount_init(&newfdp->fd_holdcnt, 1);
 1860         newfdp->fd_cmask = CMASK;
 1861         newfdp->fd_map = newfdp0->fd_dmap;
 1862         newfdp->fd_lastfile = -1;
 1863         newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 1864         newfdp->fd_files->fdt_nfiles = NDFILE;
 1865 
 1866         if (fdp == NULL)
 1867                 return (newfdp);
 1868 
 1869         if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles)
 1870                 fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 1871 
 1872         FILEDESC_SLOCK(fdp);
 1873         newfdp->fd_cdir = fdp->fd_cdir;
 1874         if (newfdp->fd_cdir)
 1875                 vrefact(newfdp->fd_cdir);
 1876         newfdp->fd_rdir = fdp->fd_rdir;
 1877         if (newfdp->fd_rdir)
 1878                 vrefact(newfdp->fd_rdir);
 1879         newfdp->fd_jdir = fdp->fd_jdir;
 1880         if (newfdp->fd_jdir)
 1881                 vrefact(newfdp->fd_jdir);
 1882 
 1883         if (!prepfiles) {
 1884                 FILEDESC_SUNLOCK(fdp);
 1885         } else {
 1886                 while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 1887                         FILEDESC_SUNLOCK(fdp);
 1888                         fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 1889                         FILEDESC_SLOCK(fdp);
 1890                 }
 1891         }
 1892 
 1893         return (newfdp);
 1894 }
 1895 
 1896 static struct filedesc *
 1897 fdhold(struct proc *p)
 1898 {
 1899         struct filedesc *fdp;
 1900 
 1901         PROC_LOCK_ASSERT(p, MA_OWNED);
 1902         fdp = p->p_fd;
 1903         if (fdp != NULL)
 1904                 refcount_acquire(&fdp->fd_holdcnt);
 1905         return (fdp);
 1906 }
 1907 
 1908 static void
 1909 fddrop(struct filedesc *fdp)
 1910 {
 1911 
 1912         if (fdp->fd_holdcnt > 1) {
 1913                 if (refcount_release(&fdp->fd_holdcnt) == 0)
 1914                         return;
 1915         }
 1916 
 1917         FILEDESC_LOCK_DESTROY(fdp);
 1918         uma_zfree(filedesc0_zone, fdp);
 1919 }
 1920 
 1921 /*
 1922  * Share a filedesc structure.
 1923  */
 1924 struct filedesc *
 1925 fdshare(struct filedesc *fdp)
 1926 {
 1927 
 1928         refcount_acquire(&fdp->fd_refcnt);
 1929         return (fdp);
 1930 }
 1931 
 1932 /*
 1933  * Unshare a filedesc structure, if necessary by making a copy
 1934  */
 1935 void
 1936 fdunshare(struct thread *td)
 1937 {
 1938         struct filedesc *tmp;
 1939         struct proc *p = td->td_proc;
 1940 
 1941         if (p->p_fd->fd_refcnt == 1)
 1942                 return;
 1943 
 1944         tmp = fdcopy(p->p_fd);
 1945         fdescfree(td);
 1946         p->p_fd = tmp;
 1947 }
 1948 
 1949 void
 1950 fdinstall_remapped(struct thread *td, struct filedesc *fdp)
 1951 {
 1952 
 1953         fdescfree(td);
 1954         td->td_proc->p_fd = fdp;
 1955 }
 1956 
 1957 /*
 1958  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
 1959  * this is to ease callers, not catch errors.
 1960  */
 1961 struct filedesc *
 1962 fdcopy(struct filedesc *fdp)
 1963 {
 1964         struct filedesc *newfdp;
 1965         struct filedescent *nfde, *ofde;
 1966         int i;
 1967 
 1968         MPASS(fdp != NULL);
 1969 
 1970         newfdp = fdinit(fdp, true);
 1971         /* copy all passable descriptors (i.e. not kqueue) */
 1972         newfdp->fd_freefile = -1;
 1973         for (i = 0; i <= fdp->fd_lastfile; ++i) {
 1974                 ofde = &fdp->fd_ofiles[i];
 1975                 if (ofde->fde_file == NULL ||
 1976                     (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 1977                         if (newfdp->fd_freefile == -1)
 1978                                 newfdp->fd_freefile = i;
 1979                         continue;
 1980                 }
 1981                 nfde = &newfdp->fd_ofiles[i];
 1982                 *nfde = *ofde;
 1983                 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 1984                 fhold(nfde->fde_file);
 1985                 fdused_init(newfdp, i);
 1986                 newfdp->fd_lastfile = i;
 1987         }
 1988         if (newfdp->fd_freefile == -1)
 1989                 newfdp->fd_freefile = i;
 1990         newfdp->fd_cmask = fdp->fd_cmask;
 1991         FILEDESC_SUNLOCK(fdp);
 1992         return (newfdp);
 1993 }
 1994 
 1995 /*
 1996  * Copies a filedesc structure, while remapping all file descriptors
 1997  * stored inside using a translation table.
 1998  *
 1999  * File descriptors are copied over to the new file descriptor table,
 2000  * regardless of whether the close-on-exec flag is set.
 2001  */
 2002 int
 2003 fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
 2004     struct filedesc **ret)
 2005 {
 2006         struct filedesc *newfdp;
 2007         struct filedescent *nfde, *ofde;
 2008         int error, i;
 2009 
 2010         MPASS(fdp != NULL);
 2011 
 2012         newfdp = fdinit(fdp, true);
 2013         if (nfds > fdp->fd_lastfile + 1) {
 2014                 /* New table cannot be larger than the old one. */
 2015                 error = E2BIG;
 2016                 goto bad;
 2017         }
 2018         /* Copy all passable descriptors (i.e. not kqueue). */
 2019         newfdp->fd_freefile = nfds;
 2020         for (i = 0; i < nfds; ++i) {
 2021                 if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) {
 2022                         /* File descriptor out of bounds. */
 2023                         error = EBADF;
 2024                         goto bad;
 2025                 }
 2026                 ofde = &fdp->fd_ofiles[fds[i]];
 2027                 if (ofde->fde_file == NULL) {
 2028                         /* Unused file descriptor. */
 2029                         error = EBADF;
 2030                         goto bad;
 2031                 }
 2032                 if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 2033                         /* File descriptor cannot be passed. */
 2034                         error = EINVAL;
 2035                         goto bad;
 2036                 }
 2037                 nfde = &newfdp->fd_ofiles[i];
 2038                 *nfde = *ofde;
 2039                 filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 2040                 fhold(nfde->fde_file);
 2041                 fdused_init(newfdp, i);
 2042                 newfdp->fd_lastfile = i;
 2043         }
 2044         newfdp->fd_cmask = fdp->fd_cmask;
 2045         FILEDESC_SUNLOCK(fdp);
 2046         *ret = newfdp;
 2047         return (0);
 2048 bad:
 2049         FILEDESC_SUNLOCK(fdp);
 2050         fdescfree_remapped(newfdp);
 2051         return (error);
 2052 }
 2053 
 2054 /*
 2055  * Clear POSIX style locks. This is only used when fdp loses a reference (i.e.
 2056  * one of the processes using it exits) and the table used to be shared.
       *
       * Also drops this process's reference on its filedesc_to_leader record
       * (p->p_fdtol, which must be non-NULL), clears p->p_fdtol, and frees the
       * record once neither references nor holds remain on it.
 2057  */
 2058 static void
 2059 fdclearlocks(struct thread *td)
 2060 {
 2061         struct filedesc *fdp;
 2062         struct filedesc_to_leader *fdtol;
 2063         struct flock lf;
 2064         struct file *fp;
 2065         struct proc *p;
 2066         struct vnode *vp;
 2067         int i;
 2068 
 2069         p = td->td_proc;
 2070         fdp = p->p_fd;
 2071         fdtol = p->p_fdtol;
 2072         MPASS(fdtol != NULL);
 2073 
 2074         FILEDESC_XLOCK(fdp);
 2075         KASSERT(fdtol->fdl_refcount > 0,
 2076             ("filedesc_to_refcount botch: fdl_refcount=%d",
 2077             fdtol->fdl_refcount));
              /*
               * Last reference on the leader record and the leader has the
               * P_ADVLOCK flag set: issue an F_UNLCK/F_POSIX request on
               * behalf of p_leader for every vnode-backed descriptor.
               */
 2078         if (fdtol->fdl_refcount == 1 &&
 2079             (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 2080                 for (i = 0; i <= fdp->fd_lastfile; i++) {
 2081                         fp = fdp->fd_ofiles[i].fde_file;
 2082                         if (fp == NULL || fp->f_type != DTYPE_VNODE)
 2083                                 continue;
                              /*
                               * The table lock is dropped around VOP_ADVLOCK();
                               * fp is held with fhold() across that window and
                               * released again once the lock is retaken.
                               */
 2084                         fhold(fp);
 2085                         FILEDESC_XUNLOCK(fdp);
                              /* Offset 0, length 0 from SEEK_SET. */
 2086                         lf.l_whence = SEEK_SET;
 2087                         lf.l_start = 0;
 2088                         lf.l_len = 0;
 2089                         lf.l_type = F_UNLCK;
 2090                         vp = fp->f_vnode;
 2091                         (void) VOP_ADVLOCK(vp,
 2092                             (caddr_t)p->p_leader, F_UNLCK,
 2093                             &lf, F_POSIX);
 2094                         FILEDESC_XLOCK(fdp);
 2095                         fdrop(fp, td);
 2096                 }
 2097         }
              /*
               * Before giving up the last reference, wait for anything that
               * still depends on the leader.  Each sleep is followed by a
               * full re-check from the retry label.
               */
 2098 retry:
 2099         if (fdtol->fdl_refcount == 1) {
 2100                 if (fdp->fd_holdleaderscount > 0 &&
 2101                     (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 2102                         /*
 2103                          * close() or kern_dup() has cleared a reference
 2104                          * in a shared file descriptor table.
 2105                          */
 2106                         fdp->fd_holdleaderswakeup = 1;
 2107                         sx_sleep(&fdp->fd_holdleaderscount,
 2108                             FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 2109                         goto retry;
 2110                 }
 2111                 if (fdtol->fdl_holdcount > 0) {
 2112                         /*
 2113                          * Ensure that fdtol->fdl_leader remains
 2114                          * valid in closef().
 2115                          */
 2116                         fdtol->fdl_wakeup = 1;
 2117                         sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 2118                             "fdlhold", 0);
 2119                         goto retry;
 2120                 }
 2121         }
 2122         fdtol->fdl_refcount--;
              /*
               * Unlink the record from its doubly-linked list only when no
               * references and no holds remain.  Otherwise fdtol is set to
               * NULL so the free() at the end is skipped.
               */
 2123         if (fdtol->fdl_refcount == 0 &&
 2124             fdtol->fdl_holdcount == 0) {
 2125                 fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 2126                 fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 2127         } else
 2128                 fdtol = NULL;
 2129         p->p_fdtol = NULL;
 2130         FILEDESC_XUNLOCK(fdp);
              /* The free happens after the table lock has been released. */
 2131         if (fdtol != NULL)
 2132                 free(fdtol, M_FILEDESC_TO_LEADER);
 2133 }
 2134 
 2135 /*
 2136  * Release a filedesc structure.
 2137  */
 2138 static void
 2139 fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
 2140 {
 2141         struct filedesc0 *fdp0;
 2142         struct freetable *ft, *tft;
 2143         struct filedescent *fde;
 2144         struct file *fp;
 2145         int i;
 2146 
 2147         for (i = 0; i <= fdp->fd_lastfile; i++) {
 2148                 fde = &fdp->fd_ofiles[i];
 2149                 fp = fde->fde_file;
 2150                 if (fp != NULL) {
 2151                         fdefree_last(fde);
 2152                         if (needclose)
 2153                                 (void) closef(fp, td);
 2154                         else
 2155                                 fdrop(fp, td);
 2156                 }
 2157         }
 2158 
 2159         if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 2160                 free(fdp->fd_map, M_FILEDESC);
 2161         if (fdp->fd_nfiles > NDFILE)
 2162                 free(fdp->fd_files, M_FILEDESC);
 2163 
 2164         fdp0 = (struct filedesc0 *)fdp;
 2165         SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 2166                 free(ft->ft_table, M_FILEDESC);
 2167 
 2168         fddrop(fdp);
 2169 }
 2170 
 2171 void
 2172 fdescfree(struct thread *td)
 2173 {
 2174         struct proc *p;
 2175         struct filedesc *fdp;
 2176         struct vnode *cdir, *jdir, *rdir;
 2177 
 2178         p = td->td_proc;
 2179         fdp = p->p_fd;
 2180         MPASS(fdp != NULL);
 2181 
 2182 #ifdef RACCT
 2183         if (racct_enable) {
 2184                 PROC_LOCK(p);
 2185                 racct_set(p, RACCT_NOFILE, 0);
 2186                 PROC_UNLOCK(p);
 2187         }
 2188 #endif
 2189 
 2190         if (p->p_fdtol != NULL)
 2191                 fdclearlocks(td);
 2192 
 2193         PROC_LOCK(p);
 2194         p->p_fd = NULL;
 2195         PROC_UNLOCK(p);
 2196 
 2197         if (refcount_release(&fdp->fd_refcnt) == 0)
 2198                 return;
 2199 
 2200         FILEDESC_XLOCK(fdp);
 2201         cdir = fdp->fd_cdir;
 2202         fdp->fd_cdir = NULL;
 2203         rdir = fdp->fd_rdir;
 2204         fdp->fd_rdir = NULL;
 2205         jdir = fdp->fd_jdir;
 2206         fdp->fd_jdir = NULL;
 2207         FILEDESC_XUNLOCK(fdp);
 2208 
 2209         if (cdir != NULL)
 2210                 vrele(cdir);
 2211         if (rdir != NULL)
 2212                 vrele(rdir);
 2213         if (jdir != NULL)
 2214                 vrele(jdir);
 2215 
 2216         fdescfree_fds(td, fdp, 1);
 2217 }
 2218 
 2219 void
 2220 fdescfree_remapped(struct filedesc *fdp)
 2221 {
 2222 
 2223         if (fdp->fd_cdir != NULL)
 2224                 vrele(fdp->fd_cdir);
 2225         if (fdp->fd_rdir != NULL)
 2226                 vrele(fdp->fd_rdir);
 2227         if (fdp->fd_jdir != NULL)
 2228                 vrele(fdp->fd_jdir);
 2229 
 2230         fdescfree_fds(curthread, fdp, 0);
 2231 }
 2232 
 2233 /*
 2234  * For setugid programs, we don't want to people to use that setugidness
 2235  * to generate error messages which write to a file which otherwise would
 2236  * otherwise be off-limits to the process.  We check for filesystems where
 2237  * the vnode can change out from under us after execve (like [lin]procfs).
 2238  *
 2239  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
 2240  * sufficient.  We also don't check for setugidness since we know we are.
 2241  */
 2242 static bool
 2243 is_unsafe(struct file *fp)
 2244 {
 2245         struct vnode *vp;
 2246 
 2247         if (fp->f_type != DTYPE_VNODE)
 2248                 return (false);
 2249 
 2250         vp = fp->f_vnode;
 2251         return ((vp->v_vflag & VV_PROCDEP) != 0);
 2252 }
 2253 
 2254 /*
 2255  * Make this setguid thing safe, if at all possible.
 2256  */
 2257 void
 2258 fdsetugidsafety(struct thread *td)
 2259 {
 2260         struct filedesc *fdp;
 2261         struct file *fp;
 2262         int i;
 2263 
 2264         fdp = td->td_proc->p_fd;
 2265         KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 2266         MPASS(fdp->fd_nfiles >= 3);
 2267         for (i = 0; i <= 2; i++) {
 2268                 fp = fdp->fd_ofiles[i].fde_file;
 2269                 if (fp != NULL && is_unsafe(fp)) {
 2270                         FILEDESC_XLOCK(fdp);
 2271                         knote_fdclose(td, i);
 2272                         /*
 2273                          * NULL-out descriptor prior to close to avoid
 2274                          * a race while close blocks.
 2275                          */
 2276                         fdfree(fdp, i);
 2277                         FILEDESC_XUNLOCK(fdp);
 2278                         (void) closef(fp, td);
 2279                 }
 2280         }
 2281 }
 2282 
 2283 /*
 2284  * If a specific file object occupies a specific file descriptor, close the
 2285  * file descriptor entry and drop a reference on the file object.  This is a
 2286  * convenience function to handle a subsequent error in a function that calls
 2287  * falloc() that handles the race that another thread might have closed the
 2288  * file descriptor out from under the thread creating the file object.
 2289  */
 2290 void
 2291 fdclose(struct thread *td, struct file *fp, int idx)
 2292 {
 2293         struct filedesc *fdp = td->td_proc->p_fd;
 2294 
 2295         FILEDESC_XLOCK(fdp);
 2296         if (fdp->fd_ofiles[idx].fde_file == fp) {
 2297                 fdfree(fdp, idx);
 2298                 FILEDESC_XUNLOCK(fdp);
 2299                 fdrop(fp, td);
 2300         } else
 2301                 FILEDESC_XUNLOCK(fdp);
 2302 }
 2303 
 2304 /*
 2305  * Close any files on exec?
 2306  */
 2307 void
 2308 fdcloseexec(struct thread *td)
 2309 {
 2310         struct filedesc *fdp;
 2311         struct filedescent *fde;
 2312         struct file *fp;
 2313         int i;
 2314 
 2315         fdp = td->td_proc->p_fd;
 2316         KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 2317         for (i = 0; i <= fdp->fd_lastfile; i++) {
 2318                 fde = &fdp->fd_ofiles[i];
 2319                 fp = fde->fde_file;
 2320                 if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 2321                     (fde->fde_flags & UF_EXCLOSE))) {
 2322                         FILEDESC_XLOCK(fdp);
 2323                         fdfree(fdp, i);
 2324                         (void) closefp(fdp, i, fp, td, 0);
 2325                         FILEDESC_UNLOCK_ASSERT(fdp);
 2326                 }
 2327         }
 2328 }
 2329 
 2330 /*
 2331  * It is unsafe for set[ug]id processes to be started with file
 2332  * descriptors 0..2 closed, as these descriptors are given implicit
 2333  * significance in the Standard C library.  fdcheckstd() will create a
 2334  * descriptor referencing /dev/null for each of stdin, stdout, and
 2335  * stderr that is not already open.
 2336  */
 2337 int
 2338 fdcheckstd(struct thread *td)
 2339 {
 2340         struct filedesc *fdp;
 2341         register_t save;
 2342         int i, error, devnull;
 2343 
 2344         fdp = td->td_proc->p_fd;
 2345         KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 2346         MPASS(fdp->fd_nfiles >= 3);
 2347         devnull = -1;
 2348         for (i = 0; i <= 2; i++) {
 2349                 if (fdp->fd_ofiles[i].fde_file != NULL)
 2350                         continue;
 2351 
 2352                 save = td->td_retval[0];
 2353                 if (devnull != -1) {
 2354                         error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
 2355                 } else {
 2356                         error = kern_openat(td, AT_FDCWD, "/dev/null",
 2357                             UIO_SYSSPACE, O_RDWR, 0);
 2358                         if (error == 0) {
 2359                                 devnull = td->td_retval[0];
 2360                                 KASSERT(devnull == i, ("we didn't get our fd"));
 2361                         }
 2362                 }
 2363                 td->td_retval[0] = save;
 2364                 if (error != 0)
 2365                         return (error);
 2366         }
 2367         return (0);
 2368 }
 2369 
 2370 /*
 2371  * Internal form of close.  Decrement reference count on file structure.
 2372  * Note: td may be NULL when closing a file that was being passed in a
 2373  * message.
 2374  *
 2375  * XXXRW: Giant is not required for the caller, but often will be held; this
 2376  * makes it moderately likely the Giant will be recursed in the VFS case.
       *
       * For a vnode-backed file closed with a non-NULL td, POSIX advisory
       * locks are released first (see below).  The return value is that of
       * the final fdrop().
 2377  */
 2378 int
 2379 closef(struct file *fp, struct thread *td)
 2380 {
 2381         struct vnode *vp;
 2382         struct flock lf;
 2383         struct filedesc_to_leader *fdtol;
 2384         struct filedesc *fdp;
 2385 
 2386         /*
 2387          * POSIX record locking dictates that any close releases ALL
 2388          * locks owned by this process.  This is handled by setting
 2389          * a flag in the unlock to free ONLY locks obeying POSIX
 2390          * semantics, and not to free BSD-style file locks.
 2391          * If the descriptor was in a message, POSIX-style locks
 2392          * aren't passed with the descriptor, and the thread pointer
 2393          * will be NULL.  Callers should be careful only to pass a
 2394          * NULL thread pointer when there really is no owning
 2395          * context that might have locks, or the locks will be
 2396          * leaked.
 2397          */
 2398         if (fp->f_type == DTYPE_VNODE && td != NULL) {
 2399                 vp = fp->f_vnode;
                      /* Unlock on behalf of this process's own leader first. */
 2400                 if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 2401                         lf.l_whence = SEEK_SET;
 2402                         lf.l_start = 0;
 2403                         lf.l_len = 0;
 2404                         lf.l_type = F_UNLCK;
 2405                         (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 2406                             F_UNLCK, &lf, F_POSIX);
 2407                 }
 2408                 fdtol = td->td_proc->p_fdtol;
 2409                 if (fdtol != NULL) {
 2410                         /*
 2411                          * Handle special case where file descriptor table is
 2412                          * shared between multiple process leaders.
 2413                          */
 2414                         fdp = td->td_proc->p_fd;
 2415                         FILEDESC_XLOCK(fdp);
                              /*
                               * Walk the circular list of leader records,
                               * starting after our own and stopping when we
                               * come back around to it.
                               */
 2416                         for (fdtol = fdtol->fdl_next;
 2417                             fdtol != td->td_proc->p_fdtol;
 2418                             fdtol = fdtol->fdl_next) {
 2419                                 if ((fdtol->fdl_leader->p_flag &
 2420                                     P_ADVLOCK) == 0)
 2421                                         continue;
                                      /*
                                       * Hold the record while the table lock is
                                       * dropped for VOP_ADVLOCK(); fdclearlocks()
                                       * sleeps on fdtol while the hold count is
                                       * nonzero.
                                       */
 2422                                 fdtol->fdl_holdcount++;
 2423                                 FILEDESC_XUNLOCK(fdp);
 2424                                 lf.l_whence = SEEK_SET;
 2425                                 lf.l_start = 0;
 2426                                 lf.l_len = 0;
 2427                                 lf.l_type = F_UNLCK;
 2428                                 vp = fp->f_vnode;
 2429                                 (void) VOP_ADVLOCK(vp,
 2430                                     (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 2431                                     F_POSIX);
 2432                                 FILEDESC_XLOCK(fdp);
 2433                                 fdtol->fdl_holdcount--;
                                      /* Last hold gone: wake a recorded waiter. */
 2434                                 if (fdtol->fdl_holdcount == 0 &&
 2435                                     fdtol->fdl_wakeup != 0) {
 2436                                         fdtol->fdl_wakeup = 0;
 2437                                         wakeup(fdtol);
 2438                                 }
 2439                         }
 2440                         FILEDESC_XUNLOCK(fdp);
 2441                 }
 2442         }
 2443         return (fdrop(fp, td));
 2444 }
 2445 
 2446 /*
 2447  * Initialize the file pointer with the specified properties.
 2448  *
 2449  * The ops are set with release semantics to be certain that the flags, type,
 2450  * and data are visible when ops is.  This is to prevent ops methods from being
 2451  * called with bad data.
 2452  */
 2453 void
 2454 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 2455 {
 2456         fp->f_data = data;
 2457         fp->f_flag = flag;
 2458         fp->f_type = type;
 2459         atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 2460 }
 2461 
 2462 int
 2463 fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 2464     struct file **fpp, struct filecaps *havecapsp)
 2465 {
 2466         struct filedescent *fde;
 2467         int error;
 2468 
 2469         FILEDESC_LOCK_ASSERT(fdp);
 2470 
 2471         fde = fdeget_locked(fdp, fd);
 2472         if (fde == NULL) {
 2473                 error = EBADF;
 2474                 goto out;
 2475         }
 2476 
 2477 #ifdef CAPABILITIES
 2478         error = cap_check(cap_rights_fde(fde), needrightsp);
 2479         if (error != 0)
 2480                 goto out;
 2481 #endif
 2482 
 2483         if (havecapsp != NULL)
 2484                 filecaps_copy(&fde->fde_caps, havecapsp, true);
 2485 
 2486         *fpp = fde->fde_file;
 2487 
 2488         error = 0;
 2489 out:
 2490         return (error);
 2491 }
 2492 
 2493 int
 2494 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
 2495     struct file **fpp, struct filecaps *havecapsp)
 2496 {
 2497         struct filedesc *fdp = td->td_proc->p_fd;
 2498         int error;
 2499 #ifndef CAPABILITIES
 2500         error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL);
 2501         if (error == 0 && havecapsp != NULL)
 2502                 filecaps_fill(havecapsp);
 2503 #else
 2504         struct file *fp;
 2505         seq_t seq;
 2506 
 2507         for (;;) {
 2508                 error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq);
 2509                 if (error != 0)
 2510                         return (error);
 2511 
 2512                 if (havecapsp != NULL) {
 2513                         if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
 2514                             havecapsp, false)) {
 2515                                 fdrop(fp, td);
 2516                                 goto get_locked;
 2517                         }
 2518                 }
 2519 
 2520                 if (!fd_modified(fdp, fd, seq))
 2521                         break;
 2522                 fdrop(fp, td);
 2523         }
 2524 
 2525         *fpp = fp;
 2526         return (0);
 2527 
 2528 get_locked:
 2529         FILEDESC_SLOCK(fdp);
 2530         error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
 2531         if (error == 0)
 2532                 fhold(*fpp);
 2533         FILEDESC_SUNLOCK(fdp);
 2534 #endif
 2535         return (error);
 2536 }
 2537 
      /*
       * Look up descriptor fd in fdp without taking the filedesc lock and
       * return the file, with an extra reference, in *fpp.
       *
       * Returns EBADF if fd is outside the table or the slot is empty.  With
       * CAPABILITIES, a failing cap_check() of the slot's rights against
       * needrightsp is returned as is.  Returns 0 on success.  *fpp is only
       * written on success.
       *
       * If seqp is non-NULL and CAPABILITIES is defined, *seqp receives the
       * sequence value observed for the slot; without CAPABILITIES *seqp is
       * never written.
       */
 2538 int
 2539 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 2540     struct file **fpp, seq_t *seqp)
 2541 {
 2542 #ifdef CAPABILITIES
 2543         struct filedescent *fde;
 2544 #endif
 2545         struct fdescenttbl *fdt;
 2546         struct file *fp;
 2547         u_int count;
 2548 #ifdef CAPABILITIES
 2549         seq_t seq;
 2550         cap_rights_t haverights;
 2551         int error;
 2552 #endif
 2553 
 2554         fdt = fdp->fd_files;
              /* The unsigned cast makes a negative fd fail the range check. */
 2555         if ((u_int)fd >= fdt->fdt_nfiles)
 2556                 return (EBADF);
 2557         /*
 2558          * Fetch the descriptor locklessly.  We avoid fdrop() races by
 2559          * never raising a refcount from 0.  To accomplish this we have
 2560          * to use a cmpset loop rather than an atomic_add.  The descriptor
 2561          * must be re-verified once we acquire a reference to be certain
 2562          * that the identity is still correct and we did not lose a race
 2563          * due to preemption.
 2564          */
 2565         for (;;) {
 2566 #ifdef CAPABILITIES
                      /*
                       * Snapshot rights and file pointer between two reads of
                       * the slot's sequence counter; retry if they disagree.
                       */
 2567                 seq = seq_read(fd_seq(fdt, fd));
 2568                 fde = &fdt->fdt_ofiles[fd];
 2569                 haverights = *cap_rights_fde(fde);
 2570                 fp = fde->fde_file;
 2571                 if (!seq_consistent(fd_seq(fdt, fd), seq))
 2572                         continue;
 2573 #else
 2574                 fp = fdt->fdt_ofiles[fd].fde_file;
 2575 #endif
 2576                 if (fp == NULL)
 2577                         return (EBADF);
 2578 #ifdef CAPABILITIES
 2579                 error = cap_check(&haverights, needrightsp);
 2580                 if (error != 0)
 2581                         return (error);
 2582 #endif
 2583                 count = fp->f_count;
 2584         retry:
 2585                 if (count == 0) {
 2586                         /*
 2587                          * Force a reload. Other thread could reallocate the
 2588                          * table before this fd was closed, so it possible that
 2589                          * there is a stale fp pointer in cached version.
 2590                          */
 2591                         fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files);
 2592                         continue;
 2593                 }
 2594                 /*
 2595                  * Use an acquire barrier to force re-reading of fdt so it is
 2596                  * refreshed for verification.
                       *
                       * NOTE(review): on failure fcmpset presumably refreshes
                       * 'count' with the current value, which is why the zero
                       * check at 'retry' is repeated -- confirm in atomic(9).
 2597                  */
 2598                 if (atomic_fcmpset_acq_int(&fp->f_count, &count, count + 1) == 0)
 2599                         goto retry;
 2600                 fdt = fdp->fd_files;
                      /*
                       * Reference acquired: verify the slot is unchanged.  If it
                       * changed, drop the reference and start over.
                       */
 2601 #ifdef  CAPABILITIES
 2602                 if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
 2603 #else
 2604                 if (fp == fdt->fdt_ofiles[fd].fde_file)
 2605 #endif
 2606                         break;
 2607                 fdrop(fp, curthread);
 2608         }
 2609         *fpp = fp;
 2610         if (seqp != NULL) {
 2611 #ifdef CAPABILITIES
 2612                 *seqp = seq;
 2613 #endif
 2614         }
 2615         return (0);
 2616 }
 2617 
 2618 /*
 2619  * Extract the file pointer associated with the specified descriptor for the
 2620  * current user process.
 2621  *
 2622  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
 2623  * returned.
 2624  *
 2625  * File's rights will be checked against the capability rights mask.
 2626  *
 2627  * If an error occurred the non-zero error is returned and *fpp is set to
 2628  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
 2629  * responsible for fdrop().
 2630  */
 2631 static __inline int
 2632 _fget(struct thread *td, int fd, struct file **fpp, int flags,
 2633     cap_rights_t *needrightsp, seq_t *seqp)
 2634 {
 2635         struct filedesc *fdp;
 2636         struct file *fp;
 2637         int error;
 2638 
 2639         *fpp = NULL;
 2640         fdp = td->td_proc->p_fd;
 2641         error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp);
 2642         if (error != 0)
 2643                 return (error);
 2644         if (fp->f_ops == &badfileops) {
 2645                 fdrop(fp, td);
 2646                 return (EBADF);
 2647         }
 2648 
 2649         /*
 2650          * FREAD and FWRITE failure return EBADF as per POSIX.
 2651          */
 2652         error = 0;
 2653         switch (flags) {
 2654         case FREAD:
 2655         case FWRITE:
 2656                 if ((fp->f_flag & flags) == 0)
 2657                         error = EBADF;
 2658                 break;
 2659         case FEXEC:
 2660                 if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
 2661                     ((fp->f_flag & FWRITE) != 0))
 2662                         error = EBADF;
 2663                 break;
 2664         case 0:
 2665                 break;
 2666         default:
 2667                 KASSERT(0, ("wrong flags"));
 2668         }
 2669 
 2670         if (error != 0) {
 2671                 fdrop(fp, td);
 2672                 return (error);
 2673         }
 2674 
 2675         *fpp = fp;
 2676         return (0);
 2677 }
 2678 
 2679 int
 2680 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 2681 {
 2682 
 2683         return (_fget(td, fd, fpp, 0, rightsp, NULL));
 2684 }
 2685 
 2686 int
 2687 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
 2688     struct file **fpp)
 2689 {
 2690         int error;
 2691 #ifndef CAPABILITIES
 2692         error = _fget(td, fd, fpp, 0, rightsp, NULL);
 2693         if (maxprotp != NULL)
 2694                 *maxprotp = VM_PROT_ALL;
 2695 #else
 2696         cap_rights_t fdrights;
 2697         struct filedesc *fdp = td->td_proc->p_fd;
 2698         seq_t seq;
 2699 
 2700         MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
 2701         for (;;) {
 2702                 error = _fget(td, fd, fpp, 0, rightsp, &seq);
 2703                 if (error != 0)
 2704                         return (error);
 2705                 if (maxprotp != NULL)
 2706                         fdrights = *cap_rights(fdp, fd);
 2707                 if (!fd_modified(fdp, fd, seq))
 2708                         break;
 2709                 fdrop(*fpp, td);
 2710         }
 2711 
 2712         /*
 2713          * If requested, convert capability rights to access flags.
 2714          */
 2715         if (maxprotp != NULL)
 2716                 *maxprotp = cap_rights_to_vmprot(&fdrights);
 2717 #endif
 2718         return (error);
 2719 }
 2720 
 2721 int
 2722 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 2723 {
 2724 
 2725         return (_fget(td, fd, fpp, FREAD, rightsp, NULL));
 2726 }
 2727 
 2728 int
 2729 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 2730 {
 2731 
 2732         return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
 2733 }
 2734 
 2735 int
 2736 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
 2737     struct file **fpp)
 2738 {
 2739         struct filedesc *fdp = td->td_proc->p_fd;
 2740 #ifndef CAPABILITIES
 2741         return (fget_unlocked(fdp, fd, rightsp, fpp, NULL));
 2742 #else
 2743         int error;
 2744         seq_t seq;
 2745 
 2746         MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
 2747         for (;;) {
 2748                 error = fget_unlocked(fdp, fd, rightsp, fpp, &seq);
 2749                 if (error != 0)
 2750                         return (error);
 2751                 error = cap_fcntl_check(fdp, fd, needfcntl);
 2752                 if (!fd_modified(fdp, fd, seq))
 2753                         break;
 2754                 fdrop(*fpp, td);
 2755         }
 2756         if (error != 0) {
 2757                 fdrop(*fpp, td);
 2758                 *fpp = NULL;
 2759         }
 2760         return (error);
 2761 #endif
 2762 }
 2763 
 2764 /*
 2765  * Like fget() but loads the underlying vnode, or returns an error if the
 2766  * descriptor does not represent a vnode.  Note that pipes use vnodes but
 2767  * never have VM objects.  The returned vnode will be vref()'d.
 2768  *
 2769  * XXX: what about the unused flags ?
 2770  */
 2771 static __inline int
 2772 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
 2773     struct vnode **vpp)
 2774 {
 2775         struct file *fp;
 2776         int error;
 2777 
 2778         *vpp = NULL;
 2779         error = _fget(td, fd, &fp, flags, needrightsp, NULL);
 2780         if (error != 0)
 2781                 return (error);
 2782         if (fp->f_vnode == NULL) {
 2783                 error = EINVAL;
 2784         } else {
 2785                 *vpp = fp->f_vnode;
 2786                 vrefact(*vpp);
 2787         }
 2788         fdrop(fp, td);
 2789 
 2790         return (error);
 2791 }
 2792 
 2793 int
 2794 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 2795 {
 2796 
 2797         return (_fgetvp(td, fd, 0, rightsp, vpp));
 2798 }
 2799 
 2800 int
 2801 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
 2802     struct filecaps *havecaps, struct vnode **vpp)
 2803 {
 2804         struct filedesc *fdp;
 2805         struct filecaps caps;
 2806         struct file *fp;
 2807         int error;
 2808 
 2809         fdp = td->td_proc->p_fd;
 2810         error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps);
 2811         if (error != 0)
 2812                 return (error);
 2813         if (fp->f_ops == &badfileops) {
 2814                 error = EBADF;
 2815                 goto out;
 2816         }
 2817         if (fp->f_vnode == NULL) {
 2818                 error = EINVAL;
 2819                 goto out;
 2820         }
 2821 
 2822         *havecaps = caps;
 2823         *vpp = fp->f_vnode;
 2824         vrefact(*vpp);
 2825 
 2826         return (0);
 2827 out:
 2828         filecaps_free(&caps);
 2829         return (error);
 2830 }
 2831 
 2832 int
 2833 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 2834 {
 2835 
 2836         return (_fgetvp(td, fd, FREAD, rightsp, vpp));
 2837 }
 2838 
 2839 int
 2840 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 2841 {
 2842 
 2843         return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
 2844 }
 2845 
 2846 #ifdef notyet
 2847 int
 2848 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
 2849     struct vnode **vpp)
 2850 {
 2851 
 2852         return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
 2853 }
 2854 #endif
 2855 
 2856 /*
 2857  * Handle the last reference to a file being closed.
 2858  */
 2859 int
 2860 _fdrop(struct file *fp, struct thread *td)
 2861 {
 2862         int error;
 2863 
 2864         if (fp->f_count != 0)
 2865                 panic("fdrop: count %d", fp->f_count);
 2866         error = fo_close(fp, td);
 2867         atomic_subtract_int(&openfiles, 1);
 2868         crfree(fp->f_cred);
 2869         free(fp->f_advice, M_FADVISE);
 2870         uma_zfree(file_zone, fp);
 2871 
 2872         return (error);
 2873 }
 2874 
 2875 /*
 2876  * Apply an advisory lock on a file descriptor.
 2877  *
 2878  * Just attempt to get a record lock of the requested type on the entire file
 2879  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
 2880  */
 2881 #ifndef _SYS_SYSPROTO_H_
 2882 struct flock_args {
 2883         int     fd;
 2884         int     how;
 2885 };
 2886 #endif
 2887 /* ARGSUSED */
 2888 int
 2889 sys_flock(struct thread *td, struct flock_args *uap)
 2890 {
 2891         struct file *fp;
 2892         struct vnode *vp;
 2893         struct flock lf;
 2894         cap_rights_t rights;
 2895         int error;
 2896 
 2897         error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
 2898         if (error != 0)
 2899                 return (error);
 2900         if (fp->f_type != DTYPE_VNODE) {
 2901                 fdrop(fp, td);
 2902                 return (EOPNOTSUPP);
 2903         }
 2904 
 2905         vp = fp->f_vnode;
 2906         lf.l_whence = SEEK_SET;
 2907         lf.l_start = 0;
 2908         lf.l_len = 0;
 2909         if (uap->how & LOCK_UN) {
 2910                 lf.l_type = F_UNLCK;
 2911                 atomic_clear_int(&fp->f_flag, FHASLOCK);
 2912                 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 2913                 goto done2;
 2914         }
 2915         if (uap->how & LOCK_EX)
 2916                 lf.l_type = F_WRLCK;
 2917         else if (uap->how & LOCK_SH)
 2918                 lf.l_type = F_RDLCK;
 2919         else {
 2920                 error = EBADF;
 2921                 goto done2;
 2922         }
 2923         atomic_set_int(&fp->f_flag, FHASLOCK);
 2924         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 2925             (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 2926 done2:
 2927         fdrop(fp, td);
 2928         return (error);
 2929 }
 2930 /*
 2931  * Duplicate the specified descriptor to a free descriptor.
 2932  */
 2933 int
 2934 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
 2935     int openerror, int *indxp)
 2936 {
 2937         struct filedescent *newfde, *oldfde;
 2938         struct file *fp;
 2939         int error, indx;
 2940 
 2941         KASSERT(openerror == ENODEV || openerror == ENXIO,
 2942             ("unexpected error %d in %s", openerror, __func__));
 2943 
 2944         /*
 2945          * If the to-be-dup'd fd number is greater than the allowed number
 2946          * of file descriptors, or the fd to be dup'd has already been
 2947          * closed, then reject.
 2948          */
 2949         FILEDESC_XLOCK(fdp);
 2950         if ((fp = fget_locked(fdp, dfd)) == NULL) {
 2951                 FILEDESC_XUNLOCK(fdp);
 2952                 return (EBADF);
 2953         }
 2954 
 2955         error = fdalloc(td, 0, &indx);
 2956         if (error != 0) {
 2957                 FILEDESC_XUNLOCK(fdp);
 2958                 return (error);
 2959         }
 2960 
 2961         /*
 2962          * There are two cases of interest here.
 2963          *
 2964          * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 2965          *
 2966          * For ENXIO steal away the file structure from (dfd) and store it in
 2967          * (indx).  (dfd) is effectively closed by this operation.
 2968          */
 2969         switch (openerror) {
 2970         case ENODEV:
 2971                 /*
 2972                  * Check that the mode the file is being opened for is a
 2973                  * subset of the mode of the existing descriptor.
 2974                  */
 2975                 if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 2976                         fdunused(fdp, indx);
 2977                         FILEDESC_XUNLOCK(fdp);
 2978                         return (EACCES);
 2979                 }
 2980                 fhold(fp);
 2981                 newfde = &fdp->fd_ofiles[indx];
 2982                 oldfde = &fdp->fd_ofiles[dfd];
 2983 #ifdef CAPABILITIES
 2984                 seq_write_begin(&newfde->fde_seq);
 2985 #endif
 2986                 memcpy(newfde, oldfde, fde_change_size);
 2987                 filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
 2988 #ifdef CAPABILITIES
 2989                 seq_write_end(&newfde->fde_seq);
 2990 #endif
 2991                 break;
 2992         case ENXIO:
 2993                 /*
 2994                  * Steal away the file pointer from dfd and stuff it into indx.
 2995                  */
 2996                 newfde = &fdp->fd_ofiles[indx];
 2997                 oldfde = &fdp->fd_ofiles[dfd];
 2998 #ifdef CAPABILITIES
 2999                 seq_write_begin(&newfde->fde_seq);
 3000 #endif
 3001                 memcpy(newfde, oldfde, fde_change_size);
 3002                 oldfde->fde_file = NULL;
 3003                 fdunused(fdp, dfd);
 3004 #ifdef CAPABILITIES
 3005                 seq_write_end(&newfde->fde_seq);
 3006 #endif
 3007                 break;
 3008         }
 3009         FILEDESC_XUNLOCK(fdp);
 3010         *indxp = indx;
 3011         return (0);
 3012 }
 3013 
/*
 * This sysctl determines if we will allow a process to chroot(2) if it
 * has a directory open:
 *      0: disallowed for all processes.
 *      1: allowed for processes that were not already chroot(2)'ed.
 *      2: allowed for all processes.
 *
 * The default is 1.  The value is exported read-write under the "kern"
 * sysctl tree and is consulted by pwd_chroot().
 */

static int chroot_allow_open_directories = 1;

SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
    &chroot_allow_open_directories, 0,
    "Allow a process to chroot(2) if it has a directory open");
 3027 
 3028 /*
 3029  * Helper function for raised chroot(2) security function:  Refuse if
 3030  * any filedescriptors are open directories.
 3031  */
 3032 static int
 3033 chroot_refuse_vdir_fds(struct filedesc *fdp)
 3034 {
 3035         struct vnode *vp;
 3036         struct file *fp;
 3037         int fd;
 3038 
 3039         FILEDESC_LOCK_ASSERT(fdp);
 3040 
 3041         for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 3042                 fp = fget_locked(fdp, fd);
 3043                 if (fp == NULL)
 3044                         continue;
 3045                 if (fp->f_type == DTYPE_VNODE) {
 3046                         vp = fp->f_vnode;
 3047                         if (vp->v_type == VDIR)
 3048                                 return (EPERM);
 3049                 }
 3050         }
 3051         return (0);
 3052 }
 3053 
 3054 /*
 3055 * The caller is responsible for invoking priv_check() and
 3056 * mac_vnode_check_chroot() to authorize this operation.
 3057 */
 3058 int
 3059 pwd_chroot(struct thread *td, struct vnode *vp)
 3060 {
 3061         struct filedesc *fdp;
 3062         struct vnode *oldvp;
 3063         int error;
 3064 
 3065         fdp = td->td_proc->p_fd;
 3066         FILEDESC_XLOCK(fdp);
 3067         if (chroot_allow_open_directories == 0 ||
 3068             (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 3069                 error = chroot_refuse_vdir_fds(fdp);
 3070                 if (error != 0) {
 3071                         FILEDESC_XUNLOCK(fdp);
 3072                         return (error);
 3073                 }
 3074         }
 3075         oldvp = fdp->fd_rdir;
 3076         vrefact(vp);
 3077         fdp->fd_rdir = vp;
 3078         if (fdp->fd_jdir == NULL) {
 3079                 vrefact(vp);
 3080                 fdp->fd_jdir = vp;
 3081         }
 3082         FILEDESC_XUNLOCK(fdp);
 3083         vrele(oldvp);
 3084         return (0);
 3085 }
 3086 
 3087 void
 3088 pwd_chdir(struct thread *td, struct vnode *vp)
 3089 {
 3090         struct filedesc *fdp;
 3091         struct vnode *oldvp;
 3092 
 3093         fdp = td->td_proc->p_fd;
 3094         FILEDESC_XLOCK(fdp);
 3095         VNASSERT(vp->v_usecount > 0, vp,
 3096             ("chdir to a vnode with zero usecount"));
 3097         oldvp = fdp->fd_cdir;
 3098         fdp->fd_cdir = vp;
 3099         FILEDESC_XUNLOCK(fdp);
 3100         vrele(oldvp);
 3101 }
 3102 
 3103 /*
 3104  * jail_attach(2) changes both root and working directories.
 3105  */
 3106 int
 3107 pwd_chroot_chdir(struct thread *td, struct vnode *vp)
 3108 {
 3109         struct filedesc *fdp;
 3110         struct vnode *oldvrp, *oldvcp;
 3111         int error;
 3112 
 3113         fdp = td->td_proc->p_fd;
 3114         FILEDESC_XLOCK(fdp);
 3115         error = chroot_refuse_vdir_fds(fdp);
 3116         if (error != 0) {
 3117                 FILEDESC_XUNLOCK(fdp);
 3118                 return (error);
 3119         }
 3120         oldvrp = fdp->fd_rdir;
 3121         vrefact(vp);
 3122         fdp->fd_rdir = vp;
 3123         oldvcp = fdp->fd_cdir;
 3124         vrefact(vp);
 3125         fdp->fd_cdir = vp;
 3126         if (fdp->fd_jdir == NULL) {
 3127                 vrefact(vp);
 3128                 fdp->fd_jdir = vp;
 3129         }
 3130         FILEDESC_XUNLOCK(fdp);
 3131         vrele(oldvrp);
 3132         vrele(oldvcp);
 3133         return (0);
 3134 }
 3135 
 3136 /*
 3137  * Scan all active processes and prisons to see if any of them have a current
 3138  * or root directory of `olddp'. If so, replace them with the new mount point.
 3139  */
 3140 void
 3141 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 3142 {
 3143         struct filedesc *fdp;
 3144         struct prison *pr;
 3145         struct proc *p;
 3146         int nrele;
 3147 
 3148         if (vrefcnt(olddp) == 1)
 3149                 return;
 3150         nrele = 0;
 3151         sx_slock(&allproc_lock);
 3152         FOREACH_PROC_IN_SYSTEM(p) {
 3153                 PROC_LOCK(p);
 3154                 fdp = fdhold(p);
 3155                 PROC_UNLOCK(p);
 3156                 if (fdp == NULL)
 3157                         continue;
 3158                 FILEDESC_XLOCK(fdp);
 3159                 if (fdp->fd_cdir == olddp) {
 3160                         vrefact(newdp);
 3161                         fdp->fd_cdir = newdp;
 3162                         nrele++;
 3163                 }
 3164                 if (fdp->fd_rdir == olddp) {
 3165                         vrefact(newdp);
 3166                         fdp->fd_rdir = newdp;
 3167                         nrele++;
 3168                 }
 3169                 if (fdp->fd_jdir == olddp) {
 3170                         vrefact(newdp);
 3171                         fdp->fd_jdir = newdp;
 3172                         nrele++;
 3173                 }
 3174                 FILEDESC_XUNLOCK(fdp);
 3175                 fddrop(fdp);
 3176         }
 3177         sx_sunlock(&allproc_lock);
 3178         if (rootvnode == olddp) {
 3179                 vrefact(newdp);
 3180                 rootvnode = newdp;
 3181                 nrele++;
 3182         }
 3183         mtx_lock(&prison0.pr_mtx);
 3184         if (prison0.pr_root == olddp) {
 3185                 vrefact(newdp);
 3186                 prison0.pr_root = newdp;
 3187                 nrele++;
 3188         }
 3189         mtx_unlock(&prison0.pr_mtx);
 3190         sx_slock(&allprison_lock);
 3191         TAILQ_FOREACH(pr, &allprison, pr_list) {
 3192                 mtx_lock(&pr->pr_mtx);
 3193                 if (pr->pr_root == olddp) {
 3194                         vrefact(newdp);
 3195                         pr->pr_root = newdp;
 3196                         nrele++;
 3197                 }
 3198                 mtx_unlock(&pr->pr_mtx);
 3199         }
 3200         sx_sunlock(&allprison_lock);
 3201         while (nrele--)
 3202                 vrele(olddp);
 3203 }
 3204 
 3205 struct filedesc_to_leader *
 3206 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 3207 {
 3208         struct filedesc_to_leader *fdtol;
 3209 
 3210         fdtol = malloc(sizeof(struct filedesc_to_leader),
 3211             M_FILEDESC_TO_LEADER, M_WAITOK);
 3212         fdtol->fdl_refcount = 1;
 3213         fdtol->fdl_holdcount = 0;
 3214         fdtol->fdl_wakeup = 0;
 3215         fdtol->fdl_leader = leader;
 3216         if (old != NULL) {
 3217                 FILEDESC_XLOCK(fdp);
 3218                 fdtol->fdl_next = old->fdl_next;
 3219                 fdtol->fdl_prev = old;
 3220                 old->fdl_next = fdtol;
 3221                 fdtol->fdl_next->fdl_prev = fdtol;
 3222                 FILEDESC_XUNLOCK(fdp);
 3223         } else {
 3224                 fdtol->fdl_next = fdtol;
 3225                 fdtol->fdl_prev = fdtol;
 3226         }
 3227         return (fdtol);
 3228 }
 3229 
 3230 static int
 3231 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
 3232 {
 3233         struct filedesc *fdp;
 3234         int i, count, slots;
 3235 
 3236         if (*(int *)arg1 != 0)
 3237                 return (EINVAL);
 3238 
 3239         fdp = curproc->p_fd;
 3240         count = 0;
 3241         FILEDESC_SLOCK(fdp);
 3242         slots = NDSLOTS(fdp->fd_lastfile + 1);
 3243         for (i = 0; i < slots; i++)
 3244                 count += bitcountl(fdp->fd_map[i]);
 3245         FILEDESC_SUNLOCK(fdp);
 3246 
 3247         return (SYSCTL_OUT(req, &count, sizeof(count)));
 3248 }
 3249 
 3250 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
 3251     CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
 3252     "Number of open file descriptors");
 3253 
 3254 /*
 3255  * Get file structures globally.
 3256  */
 3257 static int
 3258 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 3259 {
 3260         struct xfile xf;
 3261         struct filedesc *fdp;
 3262         struct file *fp;
 3263         struct proc *p;
 3264         int error, n;
 3265 
 3266         error = sysctl_wire_old_buffer(req, 0);
 3267         if (error != 0)
 3268                 return (error);
 3269         if (req->oldptr == NULL) {
 3270                 n = 0;
 3271                 sx_slock(&allproc_lock);
 3272                 FOREACH_PROC_IN_SYSTEM(p) {
 3273                         PROC_LOCK(p);
 3274                         if (p->p_state == PRS_NEW) {
 3275                                 PROC_UNLOCK(p);
 3276                                 continue;
 3277                         }
 3278                         fdp = fdhold(p);
 3279                         PROC_UNLOCK(p);
 3280                         if (fdp == NULL)
 3281                                 continue;
 3282                         /* overestimates sparse tables. */
 3283                         if (fdp->fd_lastfile > 0)
 3284                                 n += fdp->fd_lastfile;
 3285                         fddrop(fdp);
 3286                 }
 3287                 sx_sunlock(&allproc_lock);
 3288                 return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 3289         }
 3290         error = 0;
 3291         bzero(&xf, sizeof(xf));
 3292         xf.xf_size = sizeof(xf);
 3293         sx_slock(&allproc_lock);
 3294         FOREACH_PROC_IN_SYSTEM(p) {
 3295                 PROC_LOCK(p);
 3296                 if (p->p_state == PRS_NEW) {
 3297                         PROC_UNLOCK(p);
 3298                         continue;
 3299                 }
 3300                 if (p_cansee(req->td, p) != 0) {
 3301                         PROC_UNLOCK(p);
 3302                         continue;
 3303                 }
 3304                 xf.xf_pid = p->p_pid;
 3305                 xf.xf_uid = p->p_ucred->cr_uid;
 3306                 fdp = fdhold(p);
 3307                 PROC_UNLOCK(p);
 3308                 if (fdp == NULL)
 3309                         continue;
 3310                 FILEDESC_SLOCK(fdp);
 3311                 for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
 3312                         if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 3313                                 continue;
 3314                         xf.xf_fd = n;
 3315                         xf.xf_file = fp;
 3316                         xf.xf_data = fp->f_data;
 3317                         xf.xf_vnode = fp->f_vnode;
 3318                         xf.xf_type = fp->f_type;
 3319                         xf.xf_count = fp->f_count;
 3320                         xf.xf_msgcount = 0;
 3321                         xf.xf_offset = foffset_get(fp);
 3322                         xf.xf_flag = fp->f_flag;
 3323                         error = SYSCTL_OUT(req, &xf, sizeof(xf));
 3324                         if (error)
 3325                                 break;
 3326                 }
 3327                 FILEDESC_SUNLOCK(fdp);
 3328                 fddrop(fdp);
 3329                 if (error)
 3330                         break;
 3331         }
 3332         sx_sunlock(&allproc_lock);
 3333         return (error);
 3334 }
 3335 
 3336 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
 3337     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 3338 
/*
 * When KINFO_FILE_SIZE is defined, verify at compile time that
 * struct kinfo_file has exactly that size.
 */
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
 3342 
 3343 static int
 3344 xlate_fflags(int fflags)
 3345 {
 3346         static const struct {
 3347                 int     fflag;
 3348                 int     kf_fflag;
 3349         } fflags_table[] = {
 3350                 { FAPPEND, KF_FLAG_APPEND },
 3351                 { FASYNC, KF_FLAG_ASYNC },
 3352                 { FFSYNC, KF_FLAG_FSYNC },
 3353                 { FHASLOCK, KF_FLAG_HASLOCK },
 3354                 { FNONBLOCK, KF_FLAG_NONBLOCK },
 3355                 { FREAD, KF_FLAG_READ },
 3356                 { FWRITE, KF_FLAG_WRITE },
 3357                 { O_CREAT, KF_FLAG_CREAT },
 3358                 { O_DIRECT, KF_FLAG_DIRECT },
 3359                 { O_EXCL, KF_FLAG_EXCL },
 3360                 { O_EXEC, KF_FLAG_EXEC },
 3361                 { O_EXLOCK, KF_FLAG_EXLOCK },
 3362                 { O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 3363                 { O_SHLOCK, KF_FLAG_SHLOCK },
 3364                 { O_TRUNC, KF_FLAG_TRUNC }
 3365         };
 3366         unsigned int i;
 3367         int kflags;
 3368 
 3369         kflags = 0;
 3370         for (i = 0; i < nitems(fflags_table); i++)
 3371                 if (fflags & fflags_table[i].fflag)
 3372                         kflags |=  fflags_table[i].kf_fflag;
 3373         return (kflags);
 3374 }
 3375 
 3376 /* Trim unused data from kf_path by truncating the structure size. */
 3377 static void
 3378 pack_kinfo(struct kinfo_file *kif)
 3379 {
 3380 
 3381         kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
 3382             strlen(kif->kf_path) + 1;
 3383         kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 3384 }
 3385 
 3386 static void
 3387 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
 3388     struct kinfo_file *kif, struct filedesc *fdp, int flags)
 3389 {
 3390         int error;
 3391 
 3392         bzero(kif, sizeof(*kif));
 3393 
 3394         /* Set a default type to allow for empty fill_kinfo() methods. */
 3395         kif->kf_type = KF_TYPE_UNKNOWN;
 3396         kif->kf_flags = xlate_fflags(fp->f_flag);
 3397         if (rightsp != NULL)
 3398                 kif->kf_cap_rights = *rightsp;
 3399         else
 3400                 cap_rights_init(&kif->kf_cap_rights);
 3401         kif->kf_fd = fd;
 3402         kif->kf_ref_count = fp->f_count;
 3403         kif->kf_offset = foffset_get(fp);
 3404 
 3405         /*
 3406          * This may drop the filedesc lock, so the 'fp' cannot be
 3407          * accessed after this call.
 3408          */
 3409         error = fo_fill_kinfo(fp, kif, fdp);
 3410         if (error == 0)
 3411                 kif->kf_status |= KF_ATTR_VALID;
 3412         if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 3413                 pack_kinfo(kif);
 3414         else
 3415                 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 3416 }
 3417 
 3418 static void
 3419 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
 3420     struct kinfo_file *kif, int flags)
 3421 {
 3422         int error;
 3423 
 3424         bzero(kif, sizeof(*kif));
 3425 
 3426         kif->kf_type = KF_TYPE_VNODE;
 3427         error = vn_fill_kinfo_vnode(vp, kif);
 3428         if (error == 0)
 3429                 kif->kf_status |= KF_ATTR_VALID;
 3430         kif->kf_flags = xlate_fflags(fflags);
 3431         cap_rights_init(&kif->kf_cap_rights);
 3432         kif->kf_fd = fd;
 3433         kif->kf_ref_count = -1;
 3434         kif->kf_offset = -1;
 3435         if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 3436                 pack_kinfo(kif);
 3437         else
 3438                 kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 3439         vrele(vp);
 3440 }
 3441 
/*
 * State shared by the export_*_to_sb() helpers while descriptor
 * information is written to an sbuf.
 */
struct export_fd_buf {
        /*
         * Descriptor table whose shared lock is dropped around the sbuf
         * write; export_vnode_to_sb() accepts NULL here (nothing to unlock).
         */
        struct filedesc         *fdp;
        /* Destination buffer for the exported records. */
        struct sbuf             *sb;
        /*
         * Bytes still available for records: -1 means no limit is enforced,
         * 0 means the export has been terminated.
         */
        ssize_t                 remainder;
        /* Scratch record filled in before each write to 'sb'. */
        struct kinfo_file       kif;
        /* Export flags; KERN_FILEDESC_PACK_KINFO selects packed records. */
        int                     flags;
};
 3449 
 3450 static int
 3451 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 3452 {
 3453         struct kinfo_file *kif;
 3454 
 3455         kif = &efbuf->kif;
 3456         if (efbuf->remainder != -1) {
 3457                 if (efbuf->remainder < kif->kf_structsize) {
 3458                         /* Terminate export. */
 3459                         efbuf->remainder = 0;
 3460                         return (0);
 3461                 }
 3462                 efbuf->remainder -= kif->kf_structsize;
 3463         }
 3464         return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
 3465 }
 3466 
 3467 static int
 3468 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
 3469     struct export_fd_buf *efbuf)
 3470 {
 3471         int error;
 3472 
 3473         if (efbuf->remainder == 0)
 3474                 return (0);
 3475         export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
 3476             efbuf->flags);
 3477         FILEDESC_SUNLOCK(efbuf->fdp);
 3478         error = export_kinfo_to_sb(efbuf);
 3479         FILEDESC_SLOCK(efbuf->fdp);
 3480         return (error);
 3481 }
 3482 
 3483 static int
 3484 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
 3485     struct export_fd_buf *efbuf)
 3486 {
 3487         int error;
 3488 
 3489         if (efbuf->remainder == 0)
 3490                 return (0);
 3491         if (efbuf->fdp != NULL)
 3492                 FILEDESC_SUNLOCK(efbuf->fdp);
 3493         export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
 3494         error = export_kinfo_to_sb(efbuf);
 3495         if (efbuf->fdp != NULL)
 3496                 FILEDESC_SLOCK(efbuf->fdp);
 3497         return (error);
 3498 }
 3499 
 3500 /*
 3501  * Store a process file descriptor information to sbuf.
 3502  *
 3503  * Takes a locked proc as argument, and returns with the proc unlocked.
 3504  */
 3505 int
 3506 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
 3507     int flags)
 3508 {
 3509         struct file *fp;
 3510         struct filedesc *fdp;
 3511         struct export_fd_buf *efbuf;
 3512         struct vnode *cttyvp, *textvp, *tracevp;
 3513         int error, i;
 3514         cap_rights_t rights;
 3515 
 3516         PROC_LOCK_ASSERT(p, MA_OWNED);
 3517 
 3518         /* ktrace vnode */
 3519         tracevp = p->p_tracevp;
 3520         if (tracevp != NULL)
 3521                 vrefact(tracevp);
 3522         /* text vnode */
 3523         textvp = p->p_textvp;
 3524         if (textvp != NULL)
 3525                 vrefact(textvp);
 3526         /* Controlling tty. */
 3527         cttyvp = NULL;
 3528         if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 3529                 cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 3530                 if (cttyvp != NULL)
 3531                         vrefact(cttyvp);
 3532         }
 3533         fdp = fdhold(p);
 3534         PROC_UNLOCK(p);
 3535         efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 3536         efbuf->fdp = NULL;
 3537         efbuf->sb = sb;
 3538         efbuf->remainder = maxlen;
 3539         efbuf->flags = flags;
 3540         if (tracevp != NULL)
 3541                 export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
 3542                     efbuf);
 3543         if (textvp != NULL)
 3544                 export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
 3545         if (cttyvp != NULL)
 3546                 export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
 3547                     efbuf);
 3548         error = 0;
 3549         if (fdp == NULL)
 3550                 goto fail;
 3551         efbuf->fdp = fdp;
 3552         FILEDESC_SLOCK(fdp);
 3553         /* working directory */
 3554         if (fdp->fd_cdir != NULL) {
 3555                 vrefact(fdp->fd_cdir);
 3556                 export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 3557         }
 3558         /* root directory */
 3559         if (fdp->fd_rdir != NULL) {
 3560                 vrefact(fdp->fd_rdir);
 3561                 export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
 3562         }
 3563         /* jail directory */
 3564         if (fdp->fd_jdir != NULL) {
 3565                 vrefact(fdp->fd_jdir);
 3566                 export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
 3567         }
 3568         for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 3569                 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 3570                         continue;
 3571 #ifdef CAPABILITIES
 3572                 rights = *cap_rights(fdp, i);
 3573 #else /* !CAPABILITIES */
 3574                 cap_rights_init(&rights);
 3575 #endif
 3576                 /*
 3577                  * Create sysctl entry.  It is OK to drop the filedesc
 3578                  * lock inside of export_file_to_sb() as we will
 3579                  * re-validate and re-evaluate its properties when the
 3580                  * loop continues.
 3581                  */
 3582                 error = export_file_to_sb(fp, i, &rights, efbuf);
 3583                 if (error != 0 || efbuf->remainder == 0)
 3584                         break;
 3585         }
 3586         FILEDESC_SUNLOCK(fdp);
 3587         fddrop(fdp);
 3588 fail:
 3589         free(efbuf, M_TEMP);
 3590         return (error);
 3591 }
 3592 
 3593 #define FILEDESC_SBUF_SIZE      (sizeof(struct kinfo_file) * 5)
 3594 
 3595 /*
 3596  * Get per-process file descriptors for use by procstat(1), et al.
 3597  */
 3598 static int
 3599 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 3600 {
 3601         struct sbuf sb;
 3602         struct proc *p;
 3603         ssize_t maxlen;
 3604         int error, error2, *name;
 3605 
 3606         name = (int *)arg1;
 3607 
 3608         sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 3609         sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 3610         error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 3611         if (error != 0) {
 3612                 sbuf_delete(&sb);
 3613                 return (error);
 3614         }
 3615         maxlen = req->oldptr != NULL ? req->oldlen : -1;
 3616         error = kern_proc_filedesc_out(p, &sb, maxlen,
 3617             KERN_FILEDESC_PACK_KINFO);
 3618         error2 = sbuf_finish(&sb);
 3619         sbuf_delete(&sb);
 3620         return (error != 0 ? error : error2);
 3621 }
 3622 
 3623 #ifdef KINFO_OFILE_SIZE
 3624 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 3625 #endif
 3626 
 3627 #ifdef COMPAT_FREEBSD7
 3628 static void
 3629 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 3630 {
 3631 
 3632         okif->kf_structsize = sizeof(*okif);
 3633         okif->kf_type = kif->kf_type;
 3634         okif->kf_fd = kif->kf_fd;
 3635         okif->kf_ref_count = kif->kf_ref_count;
 3636         okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
 3637             KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
 3638             KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
 3639         okif->kf_offset = kif->kf_offset;
 3640         okif->kf_vnode_type = kif->kf_vnode_type;
 3641         okif->kf_sock_domain = kif->kf_sock_domain;
 3642         okif->kf_sock_type = kif->kf_sock_type;
 3643         okif->kf_sock_protocol = kif->kf_sock_protocol;
 3644         strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
 3645         okif->kf_sa_local = kif->kf_sa_local;
 3646         okif->kf_sa_peer = kif->kf_sa_peer;
 3647 }
 3648 
 3649 static int
 3650 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
 3651     struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
 3652 {
 3653         int error;
 3654 
 3655         vrefact(vp);
 3656         FILEDESC_SUNLOCK(fdp);
 3657         export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
 3658         kinfo_to_okinfo(kif, okif);
 3659         error = SYSCTL_OUT(req, okif, sizeof(*okif));
 3660         FILEDESC_SLOCK(fdp);
 3661         return (error);
 3662 }
 3663 
 3664 /*
 3665  * Get per-process file descriptors for use by procstat(1), et al.
 3666  */
 3667 static int
 3668 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 3669 {
 3670         struct kinfo_ofile *okif;
 3671         struct kinfo_file *kif;
 3672         struct filedesc *fdp;
 3673         int error, i, *name;
 3674         struct file *fp;
 3675         struct proc *p;
 3676 
 3677         name = (int *)arg1;
 3678         error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 3679         if (error != 0)
 3680                 return (error);
 3681         fdp = fdhold(p);
 3682         PROC_UNLOCK(p);
 3683         if (fdp == NULL)
 3684                 return (ENOENT);
 3685         kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 3686         okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 3687         FILEDESC_SLOCK(fdp);
 3688         if (fdp->fd_cdir != NULL)
 3689                 export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
 3690                     okif, fdp, req);
 3691         if (fdp->fd_rdir != NULL)
 3692                 export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
 3693                     okif, fdp, req);
 3694         if (fdp->fd_jdir != NULL)
 3695                 export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 3696                     okif, fdp, req);
 3697         for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 3698                 if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 3699                         continue;
 3700                 export_file_to_kinfo(fp, i, NULL, kif, fdp,
 3701                     KERN_FILEDESC_PACK_KINFO);
 3702                 FILEDESC_SUNLOCK(fdp);
 3703                 kinfo_to_okinfo(kif, okif);
 3704                 error = SYSCTL_OUT(req, okif, sizeof(*okif));
 3705                 FILEDESC_SLOCK(fdp);
 3706                 if (error)
 3707                         break;
 3708         }
 3709         FILEDESC_SUNLOCK(fdp);
 3710         fddrop(fdp);
 3711         free(kif, M_TEMP);
 3712         free(okif, M_TEMP);
 3713         return (0);
 3714 }
 3715 
 3716 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
 3717     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
 3718     "Process ofiledesc entries");
 3719 #endif  /* COMPAT_FREEBSD7 */
 3720 
 3721 int
 3722 vntype_to_kinfo(int vtype)
 3723 {
 3724         struct {
 3725                 int     vtype;
 3726                 int     kf_vtype;
 3727         } vtypes_table[] = {
 3728                 { VBAD, KF_VTYPE_VBAD },
 3729                 { VBLK, KF_VTYPE_VBLK },
 3730                 { VCHR, KF_VTYPE_VCHR },
 3731                 { VDIR, KF_VTYPE_VDIR },
 3732                 { VFIFO, KF_VTYPE_VFIFO },
 3733                 { VLNK, KF_VTYPE_VLNK },
 3734                 { VNON, KF_VTYPE_VNON },
 3735                 { VREG, KF_VTYPE_VREG },
 3736                 { VSOCK, KF_VTYPE_VSOCK }
 3737         };
 3738         unsigned int i;
 3739 
 3740         /*
 3741          * Perform vtype translation.
 3742          */
 3743         for (i = 0; i < nitems(vtypes_table); i++)
 3744                 if (vtypes_table[i].vtype == vtype)
 3745                         return (vtypes_table[i].kf_vtype);
 3746 
 3747         return (KF_VTYPE_UNKNOWN);
 3748 }
 3749 
 3750 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
 3751     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
 3752     "Process filedesc entries");
 3753 
 3754 /*
 3755  * Store a process current working directory information to sbuf.
 3756  *
 3757  * Takes a locked proc as argument, and returns with the proc unlocked.
 3758  */
 3759 int
 3760 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 3761 {
 3762         struct filedesc *fdp;
 3763         struct export_fd_buf *efbuf;
 3764         int error;
 3765 
 3766         PROC_LOCK_ASSERT(p, MA_OWNED);
 3767 
 3768         fdp = fdhold(p);
 3769         PROC_UNLOCK(p);
 3770         if (fdp == NULL)
 3771                 return (EINVAL);
 3772 
 3773         efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 3774         efbuf->fdp = fdp;
 3775         efbuf->sb = sb;
 3776         efbuf->remainder = maxlen;
 3777 
 3778         FILEDESC_SLOCK(fdp);
 3779         if (fdp->fd_cdir == NULL)
 3780                 error = EINVAL;
 3781         else {
 3782                 vrefact(fdp->fd_cdir);
 3783                 error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD,
 3784                     FREAD, efbuf);
 3785         }
 3786         FILEDESC_SUNLOCK(fdp);
 3787         fddrop(fdp);
 3788         free(efbuf, M_TEMP);
 3789         return (error);
 3790 }
 3791 
 3792 /*
 3793  * Get per-process current working directory.
 3794  */
 3795 static int
 3796 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 3797 {
 3798         struct sbuf sb;
 3799         struct proc *p;
 3800         ssize_t maxlen;
 3801         int error, error2, *name;
 3802 
 3803         name = (int *)arg1;
 3804 
 3805         sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 3806         sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 3807         error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 3808         if (error != 0) {
 3809                 sbuf_delete(&sb);
 3810                 return (error);
 3811         }
 3812         maxlen = req->oldptr != NULL ? req->oldlen : -1;
 3813         error = kern_proc_cwd_out(p, &sb, maxlen);
 3814         error2 = sbuf_finish(&sb);
 3815         sbuf_delete(&sb);
 3816         return (error != 0 ? error : error2);
 3817 }
 3818 
 3819 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
 3820     sysctl_kern_proc_cwd, "Process current working directory");
 3821 
 3822 #ifdef DDB
 3823 /*
 3824  * For the purposes of debugging, generate a human-readable string for the
 3825  * file type.
 3826  */
 3827 static const char *
 3828 file_type_to_name(short type)
 3829 {
 3830 
 3831         switch (type) {
 3832         case 0:
 3833                 return ("zero");
 3834         case DTYPE_VNODE:
 3835                 return ("vnod");
 3836         case DTYPE_SOCKET:
 3837                 return ("sock");
 3838         case DTYPE_PIPE:
 3839                 return ("pipe");
 3840         case DTYPE_FIFO:
 3841                 return ("fifo");
 3842         case DTYPE_KQUEUE:
 3843                 return ("kque");
 3844         case DTYPE_CRYPTO:
 3845                 return ("crpt");
 3846         case DTYPE_MQUEUE:
 3847                 return ("mque");
 3848         case DTYPE_SHM:
 3849                 return ("shm");
 3850         case DTYPE_SEM:
 3851                 return ("ksem");
 3852         default:
 3853                 return ("unkn");
 3854         }
 3855 }
 3856 
 3857 /*
 3858  * For the purposes of debugging, identify a process (if any, perhaps one of
 3859  * many) that references the passed file in its file descriptor array. Return
 3860  * NULL if none.
 3861  */
 3862 static struct proc *
 3863 file_to_first_proc(struct file *fp)
 3864 {
 3865         struct filedesc *fdp;
 3866         struct proc *p;
 3867         int n;
 3868 
 3869         FOREACH_PROC_IN_SYSTEM(p) {
 3870                 if (p->p_state == PRS_NEW)
 3871                         continue;
 3872                 fdp = p->p_fd;
 3873                 if (fdp == NULL)
 3874                         continue;
 3875                 for (n = 0; n <= fdp->fd_lastfile; n++) {
 3876                         if (fp == fdp->fd_ofiles[n].fde_file)
 3877                                 return (p);
 3878                 }
 3879         }
 3880         return (NULL);
 3881 }
 3882 
 3883 static void
 3884 db_print_file(struct file *fp, int header)
 3885 {
 3886         struct proc *p;
 3887 
 3888         if (header)
 3889                 db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
 3890                     "File", "Type", "Data", "Flag", "GCFl", "Count",
 3891                     "MCount", "Vnode", "FPID", "FCmd");
 3892         p = file_to_first_proc(fp);
 3893         db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 3894             file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 3895             0, fp->f_count, 0, fp->f_vnode,
 3896             p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
 3897 }
 3898 
 3899 DB_SHOW_COMMAND(file, db_show_file)
 3900 {
 3901         struct file *fp;
 3902 
 3903         if (!have_addr) {
 3904                 db_printf("usage: show file <addr>\n");
 3905                 return;
 3906         }
 3907         fp = (struct file *)addr;
 3908         db_print_file(fp, 1);
 3909 }
 3910 
 3911 DB_SHOW_COMMAND(files, db_show_files)
 3912 {
 3913         struct filedesc *fdp;
 3914         struct file *fp;
 3915         struct proc *p;
 3916         int header;
 3917         int n;
 3918 
 3919         header = 1;
 3920         FOREACH_PROC_IN_SYSTEM(p) {
 3921                 if (p->p_state == PRS_NEW)
 3922                         continue;
 3923                 if ((fdp = p->p_fd) == NULL)
 3924                         continue;
 3925                 for (n = 0; n <= fdp->fd_lastfile; ++n) {
 3926                         if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 3927                                 continue;
 3928                         db_print_file(fp, header);
 3929                         header = 0;
 3930                 }
 3931         }
 3932 }
 3933 #endif
 3934 
 3935 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
 3936     &maxfilesperproc, 0, "Maximum files allowed open per process");
 3937 
 3938 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
 3939     &maxfiles, 0, "Maximum number of files");
 3940 
 3941 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
 3942     __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
 3943 
 3944 /* ARGSUSED*/
 3945 static void
 3946 filelistinit(void *dummy)
 3947 {
 3948 
 3949         file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 3950             NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 3951         filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
 3952             NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 3953         mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 3954 }
 3955 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 3956 
 3957 /*-------------------------------------------------------------------*/
 3958 
 3959 static int
 3960 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
 3961     int flags, struct thread *td)
 3962 {
 3963 
 3964         return (EBADF);
 3965 }
 3966 
 3967 static int
 3968 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
 3969     struct thread *td)
 3970 {
 3971 
 3972         return (EINVAL);
 3973 }
 3974 
 3975 static int
 3976 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
 3977     struct thread *td)
 3978 {
 3979 
 3980         return (EBADF);
 3981 }
 3982 
 3983 static int
 3984 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
 3985     struct thread *td)
 3986 {
 3987 
 3988         return (0);
 3989 }
 3990 
 3991 static int
 3992 badfo_kqfilter(struct file *fp, struct knote *kn)
 3993 {
 3994 
 3995         return (EBADF);
 3996 }
 3997 
 3998 static int
 3999 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
 4000     struct thread *td)
 4001 {
 4002 
 4003         return (EBADF);
 4004 }
 4005 
 4006 static int
 4007 badfo_close(struct file *fp, struct thread *td)
 4008 {
 4009 
 4010         return (0);
 4011 }
 4012 
 4013 static int
 4014 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
 4015     struct thread *td)
 4016 {
 4017 
 4018         return (EBADF);
 4019 }
 4020 
 4021 static int
 4022 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 4023     struct thread *td)
 4024 {
 4025 
 4026         return (EBADF);
 4027 }
 4028 
 4029 static int
 4030 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 4031     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
 4032     struct thread *td)
 4033 {
 4034 
 4035         return (EBADF);
 4036 }
 4037 
 4038 static int
 4039 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 4040 {
 4041 
 4042         return (0);
 4043 }
 4044 
 4045 struct fileops badfileops = {
 4046         .fo_read = badfo_readwrite,
 4047         .fo_write = badfo_readwrite,
 4048         .fo_truncate = badfo_truncate,
 4049         .fo_ioctl = badfo_ioctl,
 4050         .fo_poll = badfo_poll,
 4051         .fo_kqfilter = badfo_kqfilter,
 4052         .fo_stat = badfo_stat,
 4053         .fo_close = badfo_close,
 4054         .fo_chmod = badfo_chmod,
 4055         .fo_chown = badfo_chown,
 4056         .fo_sendfile = badfo_sendfile,
 4057         .fo_fill_kinfo = badfo_fill_kinfo,
 4058 };
 4059 
 4060 int
 4061 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
 4062     int flags, struct thread *td)
 4063 {
 4064 
 4065         return (EOPNOTSUPP);
 4066 }
 4067 
 4068 int
 4069 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
 4070     struct thread *td)
 4071 {
 4072 
 4073         return (EINVAL);
 4074 }
 4075 
 4076 int
 4077 invfo_ioctl(struct file *fp, u_long com, void *data,
 4078     struct ucred *active_cred, struct thread *td)
 4079 {
 4080 
 4081         return (ENOTTY);
 4082 }
 4083 
 4084 int
 4085 invfo_poll(struct file *fp, int events, struct ucred *active_cred,
 4086     struct thread *td)
 4087 {
 4088 
 4089         return (poll_no_poll(events));
 4090 }
 4091 
 4092 int
 4093 invfo_kqfilter(struct file *fp, struct knote *kn)
 4094 {
 4095 
 4096         return (EINVAL);
 4097 }
 4098 
 4099 int
 4100 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
 4101     struct thread *td)
 4102 {
 4103 
 4104         return (EINVAL);
 4105 }
 4106 
 4107 int
 4108 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 4109     struct thread *td)
 4110 {
 4111 
 4112         return (EINVAL);
 4113 }
 4114 
 4115 int
 4116 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 4117     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
 4118     struct thread *td)
 4119 {
 4120 
 4121         return (EINVAL);
 4122 }
 4123 
 4124 /*-------------------------------------------------------------------*/
 4125 
 4126 /*
 4127  * File Descriptor pseudo-device driver (/dev/fd/).
 4128  *
 4129  * Opening minor device N dup()s the file (if any) connected to file
 4130  * descriptor N belonging to the calling process.  Note that this driver
 4131  * consists of only the ``open()'' routine, because all subsequent
 4132  * references to this file will be direct to the other driver.
 4133  *
 4134  * XXX: we could give this one a cloning event handler if necessary.
 4135  */
 4136 
 4137 /* ARGSUSED */
 4138 static int
 4139 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 4140 {
 4141 
 4142         /*
 4143          * XXX Kludge: set curthread->td_dupfd to contain the value of the
 4144          * the file descriptor being sought for duplication. The error
 4145          * return ensures that the vnode for this device will be released
 4146          * by vn_open. Open will detect this special error and take the
 4147          * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 4148          * will simply report the error.
 4149          */
 4150         td->td_dupfd = dev2unit(dev);
 4151         return (ENODEV);
 4152 }
 4153 
 4154 static struct cdevsw fildesc_cdevsw = {
 4155         .d_version =    D_VERSION,
 4156         .d_open =       fdopen,
 4157         .d_name =       "FD",
 4158 };
 4159 
 4160 static void
 4161 fildesc_drvinit(void *unused)
 4162 {
 4163         struct cdev *dev;
 4164 
 4165         dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
 4166             UID_ROOT, GID_WHEEL, 0666, "fd/0");
 4167         make_dev_alias(dev, "stdin");
 4168         dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
 4169             UID_ROOT, GID_WHEEL, 0666, "fd/1");
 4170         make_dev_alias(dev, "stdout");
 4171         dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
 4172             UID_ROOT, GID_WHEEL, 0666, "fd/2");
 4173         make_dev_alias(dev, "stderr");
 4174 }
 4175 
 4176 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
Cache object: 5827b82c9f47357242355706407710eb
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/kern/kern_descrip.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_descrip.c