sys/kern/vfs_vnops.c
1 /* $NetBSD: vfs_vnops.c,v 1.235 2022/08/06 21:21:10 riastradh Exp $ */
2
3 /*-
4 * Copyright (c) 2009 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Andrew Doran.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1982, 1986, 1989, 1993
34 * The Regents of the University of California. All rights reserved.
35 * (c) UNIX System Laboratories, Inc.
36 * All or some portions of this file are derived from material licensed
37 * to the University of California by American Telephone and Telegraph
38 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
39 * the permission of UNIX System Laboratories, Inc.
40 *
41 * Redistribution and use in source and binary forms, with or without
42 * modification, are permitted provided that the following conditions
43 * are met:
44 * 1. Redistributions of source code must retain the above copyright
45 * notice, this list of conditions and the following disclaimer.
46 * 2. Redistributions in binary form must reproduce the above copyright
47 * notice, this list of conditions and the following disclaimer in the
48 * documentation and/or other materials provided with the distribution.
49 * 3. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95
66 */
67
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c,v 1.235 2022/08/06 21:21:10 riastradh Exp $");
70
71 #include "veriexec.h"
72
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/kernel.h>
76 #include <sys/file.h>
77 #include <sys/stat.h>
78 #include <sys/buf.h>
79 #include <sys/proc.h>
80 #include <sys/mount.h>
81 #include <sys/namei.h>
82 #include <sys/vnode_impl.h>
83 #include <sys/ioctl.h>
84 #include <sys/tty.h>
85 #include <sys/poll.h>
86 #include <sys/kauth.h>
87 #include <sys/syslog.h>
88 #include <sys/fstrans.h>
89 #include <sys/atomic.h>
90 #include <sys/filedesc.h>
91 #include <sys/wapbl.h>
92 #include <sys/mman.h>
93
94 #include <miscfs/specfs/specdev.h>
95 #include <miscfs/fifofs/fifo.h>
96
97 #include <uvm/uvm_extern.h>
98 #include <uvm/uvm_readahead.h>
99 #include <uvm/uvm_device.h>
100
101 #ifdef UNION
102 #include <fs/union/union.h>
103 #endif
104
105 #ifndef COMPAT_ZERODEV
106 #define COMPAT_ZERODEV(dev) (0)
107 #endif
108
109 int (*vn_union_readdir_hook)(struct vnode **, struct file *, struct lwp *);
110
111 #include <sys/verified_exec.h>
112
113 static int vn_read(file_t *fp, off_t *offset, struct uio *uio,
114 kauth_cred_t cred, int flags);
115 static int vn_write(file_t *fp, off_t *offset, struct uio *uio,
116 kauth_cred_t cred, int flags);
117 static int vn_closefile(file_t *fp);
118 static int vn_poll(file_t *fp, int events);
119 static int vn_fcntl(file_t *fp, u_int com, void *data);
120 static int vn_statfile(file_t *fp, struct stat *sb);
121 static int vn_ioctl(file_t *fp, u_long com, void *data);
122 static int vn_mmap(struct file *, off_t *, size_t, int, int *, int *,
123 struct uvm_object **, int *);
124 static int vn_seek(struct file *, off_t, int, off_t *, int);
125
126 const struct fileops vnops = {
127 .fo_name = "vn",
128 .fo_read = vn_read,
129 .fo_write = vn_write,
130 .fo_ioctl = vn_ioctl,
131 .fo_fcntl = vn_fcntl,
132 .fo_poll = vn_poll,
133 .fo_stat = vn_statfile,
134 .fo_close = vn_closefile,
135 .fo_kqfilter = vn_kqfilter,
136 .fo_restart = fnullop_restart,
137 .fo_mmap = vn_mmap,
138 .fo_seek = vn_seek,
139 };
140
141 /*
142 * Common code for vnode open operations.
143 * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
144 *
145 * at_dvp is the directory for openat(), if any.
146 * pb is the path.
147 * nmode is additional namei flags, restricted to TRYEMULROOT and NOCHROOT.
148 * fmode is the open flags, converted from O_* to F*
149 * cmode is the creation file permissions.
150 *
151 * XXX shouldn't cmode be mode_t?
152 *
153 * On success produces either a vnode in *ret_vp, or if that is NULL,
154 * a file descriptor number in ret_fd.
155 *
156 * The caller may pass NULL for ret_fd (and ret_domove), in which case
157 * EOPNOTSUPP will be produced in the cases that would otherwise return
158 * a file descriptor.
159 *
160 * Note that callers that want no-follow behavior should pass
161 * O_NOFOLLOW in fmode. Neither FOLLOW nor NOFOLLOW in nmode is
162 * honored.
163 */
164 int
165 vn_open(struct vnode *at_dvp, struct pathbuf *pb,
166 int nmode, int fmode, int cmode,
167 struct vnode **ret_vp, bool *ret_domove, int *ret_fd)
168 {
169 struct nameidata nd;
170 struct vnode *vp = NULL;
171 struct lwp *l = curlwp;
172 kauth_cred_t cred = l->l_cred;
173 struct vattr va;
174 int error;
175 const char *pathstring;
176
177 KASSERT((nmode & (TRYEMULROOT | NOCHROOT)) == nmode);
178
179 KASSERT(ret_vp != NULL);
180 KASSERT((ret_domove == NULL) == (ret_fd == NULL));
181
182 if ((fmode & (O_CREAT | O_DIRECTORY)) == (O_CREAT | O_DIRECTORY))
183 return EINVAL;
184
185 NDINIT(&nd, LOOKUP, nmode, pb);
186 if (at_dvp != NULL)
187 NDAT(&nd, at_dvp);
188
189 nd.ni_cnd.cn_flags &= TRYEMULROOT | NOCHROOT;
190
191 if (fmode & O_CREAT) {
192 nd.ni_cnd.cn_nameiop = CREATE;
193 nd.ni_cnd.cn_flags |= LOCKPARENT | LOCKLEAF;
194 if ((fmode & O_EXCL) == 0 &&
195 ((fmode & O_NOFOLLOW) == 0))
196 nd.ni_cnd.cn_flags |= FOLLOW;
197 if ((fmode & O_EXCL) == 0)
198 nd.ni_cnd.cn_flags |= NONEXCLHACK;
199 } else {
200 nd.ni_cnd.cn_nameiop = LOOKUP;
201 nd.ni_cnd.cn_flags |= LOCKLEAF;
202 if ((fmode & O_NOFOLLOW) == 0)
203 nd.ni_cnd.cn_flags |= FOLLOW;
204 }
205
206 pathstring = pathbuf_stringcopy_get(nd.ni_pathbuf);
207 if (pathstring == NULL) {
208 return ENOMEM;
209 }
210
211 /*
 212  * When this "interface" was exposed to do_open(), it used
 213  * to initialize l_dupfd to -newfd-1 (thus passing in the
 214  * new file handle number to use)... but nothing in the
 215  * kernel uses that value, so just pass 0.
216 */
217 l->l_dupfd = 0;
218
219 error = namei(&nd);
220 if (error)
221 goto out;
222
223 vp = nd.ni_vp;
224
225 #if NVERIEXEC > 0
226 error = veriexec_openchk(l, nd.ni_vp, pathstring, fmode);
227 if (error) {
228 /* We have to release the locks ourselves */
229 /*
230 * 20210604 dholland passing NONEXCLHACK means we can
231 * get ni_dvp == NULL back if ni_vp exists, and we should
232 * treat that like the non-O_CREAT case.
233 */
234 if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
235 if (vp == NULL) {
236 vput(nd.ni_dvp);
237 } else {
238 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
239 if (nd.ni_dvp == nd.ni_vp)
240 vrele(nd.ni_dvp);
241 else
242 vput(nd.ni_dvp);
243 nd.ni_dvp = NULL;
244 vput(vp);
245 }
246 } else {
247 vput(vp);
248 }
249 goto out;
250 }
251 #endif /* NVERIEXEC > 0 */
252
253 /*
254 * 20210604 dholland ditto
255 */
256 if ((fmode & O_CREAT) != 0 && nd.ni_dvp != NULL) {
257 if (nd.ni_vp == NULL) {
258 vattr_null(&va);
259 va.va_type = VREG;
260 va.va_mode = cmode;
261 if (fmode & O_EXCL)
262 va.va_vaflags |= VA_EXCLUSIVE;
263 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp,
264 &nd.ni_cnd, &va);
265 if (error) {
266 vput(nd.ni_dvp);
267 goto out;
268 }
269 fmode &= ~O_TRUNC;
270 vp = nd.ni_vp;
271 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
272 vput(nd.ni_dvp);
273 } else {
274 VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
275 if (nd.ni_dvp == nd.ni_vp)
276 vrele(nd.ni_dvp);
277 else
278 vput(nd.ni_dvp);
279 nd.ni_dvp = NULL;
280 vp = nd.ni_vp;
281 if (fmode & O_EXCL) {
282 error = EEXIST;
283 goto bad;
284 }
285 fmode &= ~O_CREAT;
286 }
287 } else if ((fmode & O_CREAT) != 0) {
288 /*
289 * 20210606 dholland passing NONEXCLHACK means this
290 * case exists; it is the same as the following one
291 * but also needs to do things in the second (exists)
 292  * half of the following block (besides handling
 293  * ni_dvp, anyway).
294 */
295 vp = nd.ni_vp;
296 KASSERT((fmode & O_EXCL) == 0);
297 fmode &= ~O_CREAT;
298 } else {
299 vp = nd.ni_vp;
300 }
301 if (vp->v_type == VSOCK) {
302 error = EOPNOTSUPP;
303 goto bad;
304 }
305 if (nd.ni_vp->v_type == VLNK) {
306 error = EFTYPE;
307 goto bad;
308 }
309
310 if ((fmode & O_CREAT) == 0) {
311 error = vn_openchk(vp, cred, fmode);
312 if (error != 0)
313 goto bad;
314 }
315
316 if (fmode & O_TRUNC) {
317 vattr_null(&va);
318 va.va_size = 0;
319 error = VOP_SETATTR(vp, &va, cred);
320 if (error != 0)
321 goto bad;
322 }
323 if ((error = VOP_OPEN(vp, fmode, cred)) != 0)
324 goto bad;
325 if (fmode & FWRITE) {
326 mutex_enter(vp->v_interlock);
327 vp->v_writecount++;
328 mutex_exit(vp->v_interlock);
329 }
330
331 bad:
332 if (error)
333 vput(vp);
334 out:
335 pathbuf_stringcopy_put(nd.ni_pathbuf, pathstring);
336
337 switch (error) {
338 case EDUPFD:
339 case EMOVEFD:
340 /* if the caller isn't prepared to handle fds, fail for them */
341 if (ret_fd == NULL) {
342 error = EOPNOTSUPP;
343 break;
344 }
345 *ret_vp = NULL;
346 *ret_domove = error == EMOVEFD;
347 *ret_fd = l->l_dupfd;
348 error = 0;
349 break;
350 case 0:
351 *ret_vp = vp;
352 break;
353 }
354 l->l_dupfd = 0;
355 return error;
356 }
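
/*
 * Editor's note: a minimal usage sketch for vn_open(), modeled on
 * vn_bdev_openpath() later in this file.  The pathbuf helpers are the
 * ones declared in <sys/namei.h>; the function name is illustrative.
 */
#ifdef notyet
static int
example_open_read_close(const char *path)
{
	struct pathbuf *pb;
	struct vnode *vp;
	int error;

	if ((pb = pathbuf_create(path)) == NULL)
		return ENOMEM;
	error = vn_open(NULL, pb, 0, FREAD, 0, &vp, NULL, NULL);
	pathbuf_destroy(pb);
	if (error != 0)
		return error;
	/* On success *ret_vp comes back locked and referenced. */
	VOP_UNLOCK(vp);
	/* ... do I/O with vn_rdwr() ... */
	return vn_close(vp, FREAD, curlwp->l_cred);
}
#endif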
357
358 /*
359 * Check for write permissions on the specified vnode.
360 * Prototype text segments cannot be written.
361 */
362 int
363 vn_writechk(struct vnode *vp)
364 {
365
366 /*
367 * If the vnode is in use as a process's text,
368 * we can't allow writing.
369 */
370 if (vp->v_iflag & VI_TEXT)
371 return ETXTBSY;
372 return 0;
373 }
374
375 int
376 vn_openchk(struct vnode *vp, kauth_cred_t cred, int fflags)
377 {
378 int permbits = 0;
379 int error;
380
381 if (vp->v_type == VNON || vp->v_type == VBAD)
382 return ENXIO;
383
384 if ((fflags & O_DIRECTORY) != 0 && vp->v_type != VDIR)
385 return ENOTDIR;
386
387 if ((fflags & O_REGULAR) != 0 && vp->v_type != VREG)
388 return EFTYPE;
389
390 if ((fflags & FREAD) != 0) {
391 permbits = VREAD;
392 }
393 if ((fflags & FEXEC) != 0) {
394 permbits |= VEXEC;
395 }
396 if ((fflags & (FWRITE | O_TRUNC)) != 0) {
397 permbits |= VWRITE;
398 if (vp->v_type == VDIR) {
399 error = EISDIR;
400 goto bad;
401 }
402 error = vn_writechk(vp);
403 if (error != 0)
404 goto bad;
405 }
406 error = VOP_ACCESS(vp, permbits, cred);
407 bad:
408 return error;
409 }
410
411 /*
412 * Mark a vnode as having executable mappings.
413 */
414 void
415 vn_markexec(struct vnode *vp)
416 {
417
418 if ((vp->v_iflag & VI_EXECMAP) != 0) {
419 /* Safe unlocked, as long as caller holds a reference. */
420 return;
421 }
422
423 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
424 mutex_enter(vp->v_interlock);
425 if ((vp->v_iflag & VI_EXECMAP) == 0) {
426 cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
427 vp->v_iflag |= VI_EXECMAP;
428 }
429 mutex_exit(vp->v_interlock);
430 rw_exit(vp->v_uobj.vmobjlock);
431 }
432
433 /*
434 * Mark a vnode as being the text of a process.
 435  * Fail if the vnode is currently open for writing.
436 */
437 int
438 vn_marktext(struct vnode *vp)
439 {
440
441 if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP)) == (VI_TEXT|VI_EXECMAP)) {
442 /* Safe unlocked, as long as caller holds a reference. */
443 return 0;
444 }
445
446 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
447 mutex_enter(vp->v_interlock);
448 if (vp->v_writecount != 0) {
449 KASSERT((vp->v_iflag & VI_TEXT) == 0);
450 mutex_exit(vp->v_interlock);
451 rw_exit(vp->v_uobj.vmobjlock);
452 return ETXTBSY;
453 }
454 if ((vp->v_iflag & VI_EXECMAP) == 0) {
455 cpu_count(CPU_COUNT_EXECPAGES, vp->v_uobj.uo_npages);
456 }
457 vp->v_iflag |= (VI_TEXT | VI_EXECMAP);
458 mutex_exit(vp->v_interlock);
459 rw_exit(vp->v_uobj.vmobjlock);
460 return 0;
461 }
462
463 /*
464 * Vnode close call
465 *
466 * Note: takes an unlocked vnode, while VOP_CLOSE takes a locked node.
467 */
468 int
469 vn_close(struct vnode *vp, int flags, kauth_cred_t cred)
470 {
471 int error;
472
473 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
474 if (flags & FWRITE) {
475 mutex_enter(vp->v_interlock);
476 KASSERT(vp->v_writecount > 0);
477 vp->v_writecount--;
478 mutex_exit(vp->v_interlock);
479 }
480 error = VOP_CLOSE(vp, flags, cred);
481 vput(vp);
482 return error;
483 }
484
485 static int
486 enforce_rlimit_fsize(struct vnode *vp, struct uio *uio, int ioflag)
487 {
488 struct lwp *l = curlwp;
489 off_t testoff;
490
491 if (uio->uio_rw != UIO_WRITE || vp->v_type != VREG)
492 return 0;
493
494 KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
495 if (ioflag & IO_APPEND)
496 testoff = vp->v_size;
497 else
498 testoff = uio->uio_offset;
499
500 if (testoff + uio->uio_resid >
501 l->l_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
502 mutex_enter(&proc_lock);
503 psignal(l->l_proc, SIGXFSZ);
504 mutex_exit(&proc_lock);
505 return EFBIG;
506 }
507
508 return 0;
509 }
510
511 /*
512 * Package up an I/O request on a vnode into a uio and do it.
513 */
514 int
515 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
516 enum uio_seg segflg, int ioflg, kauth_cred_t cred, size_t *aresid,
517 struct lwp *l)
518 {
519 struct uio auio;
520 struct iovec aiov;
521 int error;
522
523 if ((ioflg & IO_NODELOCKED) == 0) {
524 if (rw == UIO_READ) {
525 vn_lock(vp, LK_SHARED | LK_RETRY);
526 } else /* UIO_WRITE */ {
527 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
528 }
529 }
530 auio.uio_iov = &aiov;
531 auio.uio_iovcnt = 1;
532 aiov.iov_base = base;
533 aiov.iov_len = len;
534 auio.uio_resid = len;
535 auio.uio_offset = offset;
536 auio.uio_rw = rw;
537 if (segflg == UIO_SYSSPACE) {
538 UIO_SETUP_SYSSPACE(&auio);
539 } else {
540 auio.uio_vmspace = l->l_proc->p_vmspace;
541 }
542
543 if ((error = enforce_rlimit_fsize(vp, &auio, ioflg)) != 0)
544 goto out;
545
546 if (rw == UIO_READ) {
547 error = VOP_READ(vp, &auio, ioflg, cred);
548 } else {
549 error = VOP_WRITE(vp, &auio, ioflg, cred);
550 }
551
552 if (aresid)
553 *aresid = auio.uio_resid;
554 else
555 if (auio.uio_resid && error == 0)
556 error = EIO;
557
558 out:
559 if ((ioflg & IO_NODELOCKED) == 0) {
560 VOP_UNLOCK(vp);
561 }
562 return error;
563 }
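
/*
 * Editor's note: a sketch of a typical vn_rdwr() call, reading from a
 * referenced, unlocked vnode into a kernel buffer.  Callers that
 * already hold the vnode lock pass IO_NODELOCKED instead.  The
 * function name is illustrative.
 */
#ifdef notyet
static int
example_read_at(struct vnode *vp, void *buf, int len, off_t off)
{
	size_t resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, buf, len, off, UIO_SYSSPACE,
	    0, curlwp->l_cred, &resid, curlwp);
	if (error == 0 && resid != 0)
		error = EIO;	/* short read */
	return error;
}
#endif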
564
565 int
566 vn_readdir(file_t *fp, char *bf, int segflg, u_int count, int *done,
567 struct lwp *l, off_t **cookies, int *ncookies)
568 {
569 struct vnode *vp = fp->f_vnode;
570 struct iovec aiov;
571 struct uio auio;
572 int error, eofflag;
573
 574 	/* Limit the size of any kernel buffers used by VOP_READDIR */
575 count = uimin(MAXBSIZE, count);
576
577 unionread:
578 if (vp->v_type != VDIR)
579 return EINVAL;
580 aiov.iov_base = bf;
581 aiov.iov_len = count;
582 auio.uio_iov = &aiov;
583 auio.uio_iovcnt = 1;
584 auio.uio_rw = UIO_READ;
585 if (segflg == UIO_SYSSPACE) {
586 UIO_SETUP_SYSSPACE(&auio);
587 } else {
588 KASSERT(l == curlwp);
589 auio.uio_vmspace = l->l_proc->p_vmspace;
590 }
591 auio.uio_resid = count;
592 vn_lock(vp, LK_SHARED | LK_RETRY);
593 auio.uio_offset = fp->f_offset;
594 error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, cookies,
595 ncookies);
596 mutex_enter(&fp->f_lock);
597 fp->f_offset = auio.uio_offset;
598 mutex_exit(&fp->f_lock);
599 VOP_UNLOCK(vp);
600 if (error)
601 return error;
602
603 if (count == auio.uio_resid && vn_union_readdir_hook) {
604 struct vnode *ovp = vp;
605
606 error = (*vn_union_readdir_hook)(&vp, fp, l);
607 if (error)
608 return error;
609 if (vp != ovp)
610 goto unionread;
611 }
612
613 if (count == auio.uio_resid && (vp->v_vflag & VV_ROOT) &&
614 (vp->v_mount->mnt_flag & MNT_UNION)) {
615 struct vnode *tvp = vp;
616 vp = vp->v_mount->mnt_vnodecovered;
617 vref(vp);
618 mutex_enter(&fp->f_lock);
619 fp->f_vnode = vp;
620 fp->f_offset = 0;
621 mutex_exit(&fp->f_lock);
622 vrele(tvp);
623 goto unionread;
624 }
625 *done = count - auio.uio_resid;
626 return error;
627 }
628
629 /*
630 * File table vnode read routine.
631 */
632 static int
633 vn_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
634 int flags)
635 {
636 struct vnode *vp = fp->f_vnode;
637 int error, ioflag, fflag;
638 size_t count;
639
640 ioflag = IO_ADV_ENCODE(fp->f_advice);
641 fflag = fp->f_flag;
642 if (fflag & FNONBLOCK)
643 ioflag |= IO_NDELAY;
644 if ((fflag & (FFSYNC | FRSYNC)) == (FFSYNC | FRSYNC))
645 ioflag |= IO_SYNC;
646 if (fflag & FALTIO)
647 ioflag |= IO_ALTSEMANTICS;
648 if (fflag & FDIRECT)
649 ioflag |= IO_DIRECT;
650 if (offset == &fp->f_offset && (flags & FOF_UPDATE_OFFSET) != 0)
651 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
652 else
653 vn_lock(vp, LK_SHARED | LK_RETRY);
654 uio->uio_offset = *offset;
655 count = uio->uio_resid;
656 error = VOP_READ(vp, uio, ioflag, cred);
657 if (flags & FOF_UPDATE_OFFSET)
658 *offset += count - uio->uio_resid;
659 VOP_UNLOCK(vp);
660 return error;
661 }
662
663 /*
664 * File table vnode write routine.
665 */
666 static int
667 vn_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
668 int flags)
669 {
670 struct vnode *vp = fp->f_vnode;
671 int error, ioflag, fflag;
672 size_t count;
673
674 ioflag = IO_ADV_ENCODE(fp->f_advice) | IO_UNIT;
675 fflag = fp->f_flag;
676 if (vp->v_type == VREG && (fflag & O_APPEND))
677 ioflag |= IO_APPEND;
678 if (fflag & FNONBLOCK)
679 ioflag |= IO_NDELAY;
680 if (fflag & FFSYNC ||
681 (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
682 ioflag |= IO_SYNC;
683 else if (fflag & FDSYNC)
684 ioflag |= IO_DSYNC;
685 if (fflag & FALTIO)
686 ioflag |= IO_ALTSEMANTICS;
687 if (fflag & FDIRECT)
688 ioflag |= IO_DIRECT;
689 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
690 uio->uio_offset = *offset;
691 count = uio->uio_resid;
692
693 if ((error = enforce_rlimit_fsize(vp, uio, ioflag)) != 0)
694 goto out;
695
696 error = VOP_WRITE(vp, uio, ioflag, cred);
697
698 if (flags & FOF_UPDATE_OFFSET) {
699 if (ioflag & IO_APPEND) {
700 /*
 701 			 * SUSv3 describes the behaviour for count = 0 as follows:
702 * "Before any action ... is taken, and if nbyte is zero
703 * and the file is a regular file, the write() function
704 * ... in the absence of errors ... shall return zero
705 * and have no other results."
706 */
707 if (count)
708 *offset = uio->uio_offset;
709 } else
710 *offset += count - uio->uio_resid;
711 }
712
713 out:
714 VOP_UNLOCK(vp);
715 return error;
716 }
717
718 /*
719 * File table vnode stat routine.
720 */
721 static int
722 vn_statfile(file_t *fp, struct stat *sb)
723 {
724 struct vnode *vp = fp->f_vnode;
725 int error;
726
727 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
728 error = vn_stat(vp, sb);
729 VOP_UNLOCK(vp);
730 return error;
731 }
732
733 int
734 vn_stat(struct vnode *vp, struct stat *sb)
735 {
736 struct vattr va;
737 int error;
738 mode_t mode;
739
740 memset(&va, 0, sizeof(va));
741 error = VOP_GETATTR(vp, &va, kauth_cred_get());
742 if (error)
743 return error;
744 /*
745 * Copy from vattr table
746 */
747 memset(sb, 0, sizeof(*sb));
748 sb->st_dev = va.va_fsid;
749 sb->st_ino = va.va_fileid;
750 mode = va.va_mode;
751 switch (vp->v_type) {
752 case VREG:
753 mode |= S_IFREG;
754 break;
755 case VDIR:
756 mode |= S_IFDIR;
757 break;
758 case VBLK:
759 mode |= S_IFBLK;
760 break;
761 case VCHR:
762 mode |= S_IFCHR;
763 break;
764 case VLNK:
765 mode |= S_IFLNK;
766 break;
767 case VSOCK:
768 mode |= S_IFSOCK;
769 break;
770 case VFIFO:
771 mode |= S_IFIFO;
772 break;
773 default:
774 return EBADF;
775 }
776 sb->st_mode = mode;
777 sb->st_nlink = va.va_nlink;
778 sb->st_uid = va.va_uid;
779 sb->st_gid = va.va_gid;
780 sb->st_rdev = va.va_rdev;
781 sb->st_size = va.va_size;
782 sb->st_atimespec = va.va_atime;
783 sb->st_mtimespec = va.va_mtime;
784 sb->st_ctimespec = va.va_ctime;
785 sb->st_birthtimespec = va.va_birthtime;
786 sb->st_blksize = va.va_blocksize;
787 sb->st_flags = va.va_flags;
788 sb->st_gen = 0;
789 sb->st_blocks = va.va_bytes / S_BLKSIZE;
790 return 0;
791 }
792
793 /*
794 * File table vnode fcntl routine.
795 */
796 static int
797 vn_fcntl(file_t *fp, u_int com, void *data)
798 {
799 struct vnode *vp = fp->f_vnode;
800 int error;
801
802 error = VOP_FCNTL(vp, com, data, fp->f_flag, kauth_cred_get());
803 return error;
804 }
805
806 /*
807 * File table vnode ioctl routine.
808 */
809 static int
810 vn_ioctl(file_t *fp, u_long com, void *data)
811 {
812 struct vnode *vp = fp->f_vnode, *ovp;
813 struct vattr vattr;
814 int error;
815
816 switch (vp->v_type) {
817
818 case VREG:
819 case VDIR:
820 if (com == FIONREAD) {
821 vn_lock(vp, LK_SHARED | LK_RETRY);
822 error = VOP_GETATTR(vp, &vattr, kauth_cred_get());
823 if (error == 0)
824 *(int *)data = vattr.va_size - fp->f_offset;
825 VOP_UNLOCK(vp);
826 if (error)
827 return error;
828 return 0;
829 }
830 if ((com == FIONWRITE) || (com == FIONSPACE)) {
831 /*
 832 		 * Files don't have send queues, so there are
 833 		 * never any bytes queued in them, nor any
 834 		 * open space in them.
835 */
836 *(int *)data = 0;
837 return 0;
838 }
839 if (com == FIOGETBMAP) {
840 daddr_t *block;
841
842 if (*(daddr_t *)data < 0)
843 return EINVAL;
844 block = (daddr_t *)data;
845 vn_lock(vp, LK_SHARED | LK_RETRY);
846 error = VOP_BMAP(vp, *block, NULL, block, NULL);
847 VOP_UNLOCK(vp);
848 return error;
849 }
850 if (com == OFIOGETBMAP) {
851 daddr_t ibn, obn;
852
853 if (*(int32_t *)data < 0)
854 return EINVAL;
855 ibn = (daddr_t)*(int32_t *)data;
856 vn_lock(vp, LK_SHARED | LK_RETRY);
857 error = VOP_BMAP(vp, ibn, NULL, &obn, NULL);
858 VOP_UNLOCK(vp);
859 *(int32_t *)data = (int32_t)obn;
860 return error;
861 }
862 if (com == FIONBIO || com == FIOASYNC) /* XXX */
863 return 0; /* XXX */
864 /* FALLTHROUGH */
865 case VFIFO:
866 case VCHR:
867 case VBLK:
868 error = VOP_IOCTL(vp, com, data, fp->f_flag,
869 kauth_cred_get());
870 if (error == 0 && com == TIOCSCTTY) {
871 vref(vp);
872 mutex_enter(&proc_lock);
873 ovp = curproc->p_session->s_ttyvp;
874 curproc->p_session->s_ttyvp = vp;
875 mutex_exit(&proc_lock);
876 if (ovp != NULL)
877 vrele(ovp);
878 }
879 return error;
880
881 default:
882 return EPASSTHROUGH;
883 }
884 }
885
886 /*
887 * File table vnode poll routine.
888 */
889 static int
890 vn_poll(file_t *fp, int events)
891 {
892
893 return VOP_POLL(fp->f_vnode, events);
894 }
895
896 /*
897 * File table vnode kqfilter routine.
898 */
899 int
900 vn_kqfilter(file_t *fp, struct knote *kn)
901 {
902
903 return VOP_KQFILTER(fp->f_vnode, kn);
904 }
905
906 static int
907 vn_mmap(struct file *fp, off_t *offp, size_t size, int prot, int *flagsp,
908 int *advicep, struct uvm_object **uobjp, int *maxprotp)
909 {
910 struct uvm_object *uobj;
911 struct vnode *vp;
912 struct vattr va;
913 struct lwp *l;
914 vm_prot_t maxprot;
915 off_t off;
916 int error, flags;
917 bool needwritemap;
918
919 l = curlwp;
920
921 off = *offp;
922 flags = *flagsp;
923 maxprot = VM_PROT_EXECUTE;
924
925 KASSERT(size > 0);
926
927 vp = fp->f_vnode;
928 if (vp->v_type != VREG && vp->v_type != VCHR &&
929 vp->v_type != VBLK) {
930 /* only REG/CHR/BLK support mmap */
931 return ENODEV;
932 }
933 if (vp->v_type != VCHR && off < 0) {
934 return EINVAL;
935 }
936 #if SIZE_MAX > UINT32_MAX /* XXX -Wtype-limits */
937 if (vp->v_type != VCHR && size > __type_max(off_t)) {
938 return EOVERFLOW;
939 }
940 #endif
941 if (vp->v_type != VCHR && off > __type_max(off_t) - size) {
942 /* no offset wrapping */
943 return EOVERFLOW;
944 }
945
946 /* special case: catch SunOS style /dev/zero */
947 if (vp->v_type == VCHR &&
948 (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
949 *uobjp = NULL;
950 *maxprotp = VM_PROT_ALL;
951 return 0;
952 }
953
954 /*
955 * Old programs may not select a specific sharing type, so
956 * default to an appropriate one.
957 *
958 * XXX: how does MAP_ANON fit in the picture?
959 */
960 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
961 #if defined(DEBUG)
962 struct proc *p = l->l_proc;
963 printf("WARNING: defaulted mmap() share type to "
964 "%s (pid %d command %s)\n", vp->v_type == VCHR ?
965 "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
966 p->p_comm);
967 #endif
968 if (vp->v_type == VCHR)
969 flags |= MAP_SHARED; /* for a device */
970 else
971 flags |= MAP_PRIVATE; /* for a file */
972 }
973
974 /*
975 * MAP_PRIVATE device mappings don't make sense (and aren't
 976 	 * supported anyway).  However, some programs pass MAP_PRIVATE
 977 	 * regardless, so just convert it to MAP_SHARED.
978 */
979 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
980 flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
981 }
982
983 /*
984 * now check protection
985 */
986
987 /* check read access */
988 if (fp->f_flag & FREAD)
989 maxprot |= VM_PROT_READ;
990 else if (prot & PROT_READ) {
991 return EACCES;
992 }
993
994 /* check write access, shared case first */
995 if (flags & MAP_SHARED) {
996 /*
 997 		 * If the file is open for writing, add PROT_WRITE to
 998 		 * maxprot only if the file is neither immutable nor
 999 		 * append-only; otherwise, if PROT_WRITE was asked for,
 1000 		 * return EPERM.
1001 */
1002 if (fp->f_flag & FWRITE) {
1003 vn_lock(vp, LK_SHARED | LK_RETRY);
1004 error = VOP_GETATTR(vp, &va, l->l_cred);
1005 VOP_UNLOCK(vp);
1006 if (error) {
1007 return error;
1008 }
1009 if ((va.va_flags &
1010 (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
1011 maxprot |= VM_PROT_WRITE;
1012 else if (prot & PROT_WRITE) {
1013 return EPERM;
1014 }
1015 } else if (prot & PROT_WRITE) {
1016 return EACCES;
1017 }
1018 } else {
 1019 		/* MAP_PRIVATE mappings are always writable; writes go to a private copy */
1020 maxprot |= VM_PROT_WRITE;
1021 }
1022
1023 /*
1024 * Don't allow mmap for EXEC if the file system
1025 * is mounted NOEXEC.
1026 */
1027 if ((prot & PROT_EXEC) != 0 &&
1028 (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
1029 return EACCES;
1030 }
1031
1032 if (vp->v_type != VCHR) {
1033 error = VOP_MMAP(vp, prot, curlwp->l_cred);
1034 if (error) {
1035 return error;
1036 }
1037 vref(vp);
1038 uobj = &vp->v_uobj;
1039
1040 /*
1041 * If the vnode is being mapped with PROT_EXEC,
1042 * then mark it as text.
1043 */
1044 if (prot & PROT_EXEC) {
1045 vn_markexec(vp);
1046 }
1047 } else {
1048 int i = maxprot;
1049
1050 /*
1051 * XXX Some devices don't like to be mapped with
1052 * XXX PROT_EXEC or PROT_WRITE, but we don't really
1053 * XXX have a better way of handling this, right now
1054 */
1055 do {
1056 uobj = udv_attach(vp->v_rdev,
1057 (flags & MAP_SHARED) ? i :
1058 (i & ~VM_PROT_WRITE), off, size);
1059 i--;
1060 } while ((uobj == NULL) && (i > 0));
1061 if (uobj == NULL) {
1062 return EINVAL;
1063 }
1064 *advicep = UVM_ADV_RANDOM;
1065 }
1066
1067 /*
1068 * Set vnode flags to indicate the new kinds of mapping.
1069 * We take the vnode lock in exclusive mode here to serialize
1070 * with direct I/O.
1071 *
1072 * Safe to check for these flag values without a lock, as
1073 * long as a reference to the vnode is held.
1074 */
1075 needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
1076 (flags & MAP_SHARED) != 0 &&
1077 (maxprot & VM_PROT_WRITE) != 0;
1078 if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
1079 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1080 vp->v_vflag |= VV_MAPPED;
1081 if (needwritemap) {
1082 rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
1083 mutex_enter(vp->v_interlock);
1084 vp->v_iflag |= VI_WRMAP;
1085 mutex_exit(vp->v_interlock);
1086 rw_exit(vp->v_uobj.vmobjlock);
1087 }
1088 VOP_UNLOCK(vp);
1089 }
1090
1091 #if NVERIEXEC > 0
1092
1093 /*
1094 * Check if the file can be executed indirectly.
1095 *
1096 * XXX: This gives false warnings about "Incorrect access type"
1097 * XXX: if the mapping is not executable. Harmless, but will be
1098 * XXX: fixed as part of other changes.
1099 */
1100 if (veriexec_verify(l, vp, "(mmap)", VERIEXEC_INDIRECT,
1101 NULL)) {
1102
1103 /*
1104 * Don't allow executable mappings if we can't
1105 * indirectly execute the file.
1106 */
1107 if (prot & VM_PROT_EXECUTE) {
1108 return EPERM;
1109 }
1110
1111 /*
1112 * Strip the executable bit from 'maxprot' to make sure
1113 * it can't be made executable later.
1114 */
1115 maxprot &= ~VM_PROT_EXECUTE;
1116 }
1117 #endif /* NVERIEXEC > 0 */
1118
1119 *uobjp = uobj;
1120 *maxprotp = maxprot;
1121 *flagsp = flags;
1122
1123 return 0;
1124 }
1125
1126 static int
1127 vn_seek(struct file *fp, off_t delta, int whence, off_t *newoffp,
1128 int flags)
1129 {
1130 const off_t OFF_MIN = __type_min(off_t);
1131 const off_t OFF_MAX = __type_max(off_t);
1132 kauth_cred_t cred = fp->f_cred;
1133 off_t oldoff, newoff;
1134 struct vnode *vp = fp->f_vnode;
1135 struct vattr vattr;
1136 int error;
1137
1138 if (vp->v_type == VFIFO)
1139 return ESPIPE;
1140
1141 if (flags & FOF_UPDATE_OFFSET)
1142 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1143 else
1144 vn_lock(vp, LK_SHARED | LK_RETRY);
1145
1146 /* Compute the old and new offsets. */
1147 oldoff = fp->f_offset;
1148 switch (whence) {
1149 case SEEK_CUR:
1150 if (delta > 0) {
1151 if (oldoff > 0 && delta > OFF_MAX - oldoff) {
1152 newoff = OFF_MAX;
1153 break;
1154 }
1155 } else {
1156 if (oldoff < 0 && delta < OFF_MIN - oldoff) {
1157 newoff = OFF_MIN;
1158 break;
1159 }
1160 }
1161 newoff = oldoff + delta;
1162 break;
1163 case SEEK_END:
1164 error = VOP_GETATTR(vp, &vattr, cred);
1165 if (error)
1166 goto out;
1167 if (vattr.va_size > OFF_MAX ||
1168 delta > OFF_MAX - (off_t)vattr.va_size) {
1169 newoff = OFF_MAX;
1170 break;
1171 }
1172 newoff = delta + vattr.va_size;
1173 break;
1174 case SEEK_SET:
1175 newoff = delta;
1176 break;
1177 default:
1178 error = EINVAL;
1179 goto out;
1180 }
1181
1182 /* Pass the proposed change to the file system to audit. */
1183 error = VOP_SEEK(vp, oldoff, newoff, cred);
1184 if (error)
1185 goto out;
1186
1187 /* Success! */
1188 if (newoffp)
1189 *newoffp = newoff;
1190 if (flags & FOF_UPDATE_OFFSET)
1191 fp->f_offset = newoff;
1192 error = 0;
1193
1194 out: VOP_UNLOCK(vp);
1195 return error;
1196 }
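
/*
 * Editor's note: the SEEK_CUR case above saturates instead of letting
 * the off_t addition wrap.  The same guard, restated in isolation:
 */
#ifdef notyet
static off_t
example_sat_add(off_t oldoff, off_t delta)
{
	if (delta > 0 && oldoff > 0 && delta > __type_max(off_t) - oldoff)
		return __type_max(off_t);	/* clamp at OFF_MAX */
	if (delta <= 0 && oldoff < 0 && delta < __type_min(off_t) - oldoff)
		return __type_min(off_t);	/* clamp at OFF_MIN */
	return oldoff + delta;
}
#endif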
1197
1198 /*
1199 * Check that the vnode is still valid, and if so
 1200  * acquire the requested lock.
1201 */
1202 int
1203 vn_lock(struct vnode *vp, int flags)
1204 {
1205 struct lwp *l;
1206 int error;
1207
1208 KASSERT(vrefcnt(vp) > 0);
1209 KASSERT((flags & ~(LK_SHARED|LK_EXCLUSIVE|LK_NOWAIT|LK_RETRY|
1210 LK_UPGRADE|LK_DOWNGRADE)) == 0);
1211 KASSERT((flags & LK_NOWAIT) != 0 || !mutex_owned(vp->v_interlock));
1212
1213 #ifdef DIAGNOSTIC
1214 if (wapbl_vphaswapbl(vp))
1215 WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp));
1216 #endif
1217
1218 /* Get a more useful report for lockstat. */
1219 l = curlwp;
1220 KASSERT(l->l_rwcallsite == 0);
1221 l->l_rwcallsite = (uintptr_t)__builtin_return_address(0);
1222
1223 error = VOP_LOCK(vp, flags);
1224
1225 l->l_rwcallsite = 0;
1226
1227 switch (flags & (LK_RETRY | LK_NOWAIT)) {
1228 case 0:
1229 KASSERT(error == 0 || error == ENOENT);
1230 break;
1231 case LK_RETRY:
1232 KASSERT(error == 0);
1233 break;
1234 case LK_NOWAIT:
1235 KASSERT(error == 0 || error == EBUSY || error == ENOENT);
1236 break;
1237 case LK_RETRY | LK_NOWAIT:
1238 KASSERT(error == 0 || error == EBUSY);
1239 break;
1240 }
1241
1242 return error;
1243 }
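
/*
 * Editor's note: a sketch of the locking contract asserted above.
 * Plain vn_lock() can fail with ENOENT for a vnode being reclaimed;
 * LK_RETRY makes the call infallible; LK_NOWAIT adds EBUSY.  The
 * function name is illustrative.
 */
#ifdef notyet
static int
example_locked_getattr(struct vnode *vp, struct vattr *va)
{
	int error;

	if ((error = vn_lock(vp, LK_SHARED)) != 0)
		return error;	/* ENOENT: dead vnode */
	error = VOP_GETATTR(vp, va, curlwp->l_cred);
	VOP_UNLOCK(vp);
	return error;
}
#endif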
1244
1245 /*
1246 * File table vnode close routine.
1247 */
1248 static int
1249 vn_closefile(file_t *fp)
1250 {
1251
1252 return vn_close(fp->f_vnode, fp->f_flag, fp->f_cred);
1253 }
1254
1255 /*
1256 * Simplified in-kernel wrapper calls for extended attribute access.
 1257  * These calls pass NOCRED for the credential, authorizing "kernel" access.
1258 * Set IO_NODELOCKED in ioflg if the vnode is already locked.
1259 */
1260 int
1261 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
1262 const char *attrname, size_t *buflen, void *bf, struct lwp *l)
1263 {
1264 struct uio auio;
1265 struct iovec aiov;
1266 int error;
1267
1268 aiov.iov_len = *buflen;
1269 aiov.iov_base = bf;
1270
1271 auio.uio_iov = &aiov;
1272 auio.uio_iovcnt = 1;
1273 auio.uio_rw = UIO_READ;
1274 auio.uio_offset = 0;
1275 auio.uio_resid = *buflen;
1276 UIO_SETUP_SYSSPACE(&auio);
1277
1278 if ((ioflg & IO_NODELOCKED) == 0)
1279 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1280
1281 error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL,
1282 NOCRED);
1283
1284 if ((ioflg & IO_NODELOCKED) == 0)
1285 VOP_UNLOCK(vp);
1286
1287 if (error == 0)
1288 *buflen = *buflen - auio.uio_resid;
1289
1290 return error;
1291 }
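
/*
 * Editor's note: usage sketch for vn_extattr_get().  On success
 * *buflen is updated to the byte count actually read.  The attribute
 * name and wrapper function are illustrative.
 */
#ifdef notyet
static int
example_get_attr(struct vnode *vp, char *buf, size_t *buflen)
{
	/* ioflg 0: the wrapper takes and drops the vnode lock itself. */
	return vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example-attr", buflen, buf, curlwp);
}
#endif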
1292
1293 /*
1294 * XXX Failure mode if partially written?
1295 */
1296 int
1297 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
1298 const char *attrname, size_t buflen, const void *bf, struct lwp *l)
1299 {
1300 struct uio auio;
1301 struct iovec aiov;
1302 int error;
1303
1304 aiov.iov_len = buflen;
1305 aiov.iov_base = __UNCONST(bf); /* XXXUNCONST kills const */
1306
1307 auio.uio_iov = &aiov;
1308 auio.uio_iovcnt = 1;
1309 auio.uio_rw = UIO_WRITE;
1310 auio.uio_offset = 0;
1311 auio.uio_resid = buflen;
1312 UIO_SETUP_SYSSPACE(&auio);
1313
1314 if ((ioflg & IO_NODELOCKED) == 0) {
1315 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1316 }
1317
1318 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NOCRED);
1319
1320 if ((ioflg & IO_NODELOCKED) == 0) {
1321 VOP_UNLOCK(vp);
1322 }
1323
1324 return error;
1325 }
1326
1327 int
1328 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
1329 const char *attrname, struct lwp *l)
1330 {
1331 int error;
1332
1333 if ((ioflg & IO_NODELOCKED) == 0) {
1334 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1335 }
1336
1337 error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NOCRED);
1338 if (error == EOPNOTSUPP)
1339 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
1340 NOCRED);
1341
1342 if ((ioflg & IO_NODELOCKED) == 0) {
1343 VOP_UNLOCK(vp);
1344 }
1345
1346 return error;
1347 }
1348
1349 int
1350 vn_fifo_bypass(void *v)
1351 {
1352 struct vop_generic_args *ap = v;
1353
1354 return VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, v);
1355 }
1356
1357 /*
1358 * Open block device by device number
1359 */
1360 int
1361 vn_bdev_open(dev_t dev, struct vnode **vpp, struct lwp *l)
1362 {
1363 int error;
1364
1365 if ((error = bdevvp(dev, vpp)) != 0)
1366 return error;
1367
1368 vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1369 if ((error = VOP_OPEN(*vpp, FREAD | FWRITE, l->l_cred)) != 0) {
1370 vput(*vpp);
1371 return error;
1372 }
1373 mutex_enter((*vpp)->v_interlock);
1374 (*vpp)->v_writecount++;
1375 mutex_exit((*vpp)->v_interlock);
1376 VOP_UNLOCK(*vpp);
1377
1378 return 0;
1379 }
1380
1381 /*
 1382  * Look up the provided name in the filesystem.  If the file exists,
1383 * is a valid block device, and isn't being used by anyone else,
1384 * set *vpp to the file's vnode.
1385 */
1386 int
1387 vn_bdev_openpath(struct pathbuf *pb, struct vnode **vpp, struct lwp *l)
1388 {
1389 struct vnode *vp;
1390 dev_t dev;
1391 enum vtype vt;
1392 int error;
1393
1394 error = vn_open(NULL, pb, 0, FREAD | FWRITE, 0, &vp, NULL, NULL);
1395 if (error != 0)
1396 return error;
1397
1398 dev = vp->v_rdev;
1399 vt = vp->v_type;
1400
1401 VOP_UNLOCK(vp);
1402 (void) vn_close(vp, FREAD | FWRITE, l->l_cred);
1403
1404 if (vt != VBLK)
1405 return ENOTBLK;
1406
1407 return vn_bdev_open(dev, vpp, l);
1408 }
1409
1410 static long
1411 vn_knote_to_interest(const struct knote *kn)
1412 {
1413 switch (kn->kn_filter) {
1414 case EVFILT_READ:
1415 /*
1416 * Writing to the file or changing its attributes can
1417 * set the file size, which impacts the readability
1418 * filter.
1419 *
1420 * (No need to set NOTE_EXTEND here; it's only ever
 1421 		 * sent with other hints; see vnode_if.c.)
1422 */
1423 return NOTE_WRITE | NOTE_ATTRIB;
1424
1425 case EVFILT_VNODE:
1426 return kn->kn_sfflags;
1427
1428 case EVFILT_WRITE:
1429 default:
1430 return 0;
1431 }
1432 }
1433
1434 void
1435 vn_knote_attach(struct vnode *vp, struct knote *kn)
1436 {
1437 struct vnode_klist *vk = vp->v_klist;
1438 long interest = 0;
1439
1440 /*
1441 * In the case of layered / stacked file systems, knotes
1442 * should only ever be associated with the base vnode.
1443 */
1444 KASSERT(kn->kn_hook == vp);
1445 KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);
1446
1447 /*
1448 * We maintain a bitmask of the kevents that there is interest in,
1449 * to minimize the impact of having watchers. It's silly to have
1450 * to traverse vn_klist every time a read or write happens simply
1451 * because there is someone interested in knowing when the file
1452 * is deleted, for example.
1453 */
1454
1455 mutex_enter(vp->v_interlock);
1456 SLIST_INSERT_HEAD(&vk->vk_klist, kn, kn_selnext);
1457 SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
1458 interest |= vn_knote_to_interest(kn);
1459 }
1460 vk->vk_interest = interest;
1461 mutex_exit(vp->v_interlock);
1462 }
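
/*
 * Editor's note: a sketch of how the interest bitmask maintained above
 * can be consulted before walking the klist.  The real consumer lives
 * in the VOP dispatch code (see vnode_if.c); this restatement is
 * illustrative only.
 */
#ifdef notyet
static void
example_vn_knote(struct vnode *vp, long hint)
{
	struct vnode_klist *vk = vp->v_klist;

	/* Cheap test: skip the list walk if nobody wants 'hint'. */
	if ((vk->vk_interest & hint) == 0)
		return;
	mutex_enter(vp->v_interlock);
	knote(&vk->vk_klist, hint);
	mutex_exit(vp->v_interlock);
}
#endif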
1463
1464 void
1465 vn_knote_detach(struct vnode *vp, struct knote *kn)
1466 {
1467 struct vnode_klist *vk = vp->v_klist;
1468 long interest = 0;
1469
1470 /* See above. */
1471 KASSERT(kn->kn_hook == vp);
1472 KASSERT(vp->v_klist == &VNODE_TO_VIMPL(vp)->vi_klist);
1473
1474 /*
1475 * We special case removing the head of the list, because:
1476 *
1477 * 1. It's extremely likely that we're detaching the only
1478 * knote.
1479 *
1480 * 2. We're already traversing the whole list, so we don't
1481 * want to use the generic SLIST_REMOVE() which would
1482 * traverse it *again*.
1483 */
1484
1485 mutex_enter(vp->v_interlock);
1486 if (__predict_true(kn == SLIST_FIRST(&vk->vk_klist))) {
1487 SLIST_REMOVE_HEAD(&vk->vk_klist, kn_selnext);
1488 SLIST_FOREACH(kn, &vk->vk_klist, kn_selnext) {
1489 interest |= vn_knote_to_interest(kn);
1490 }
1491 vk->vk_interest = interest;
1492 } else {
1493 struct knote *thiskn, *nextkn, *prevkn = NULL;
1494
1495 SLIST_FOREACH_SAFE(thiskn, &vk->vk_klist, kn_selnext, nextkn) {
1496 if (thiskn == kn) {
1497 KASSERT(kn != NULL);
1498 KASSERT(prevkn != NULL);
1499 SLIST_REMOVE_AFTER(prevkn, kn_selnext);
1500 kn = NULL;
1501 } else {
1502 interest |= vn_knote_to_interest(thiskn);
1503 prevkn = thiskn;
1504 }
1505 }
1506 vk->vk_interest = interest;
1507 }
1508 mutex_exit(vp->v_interlock);
1509 }