FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1982, 1986, 1989, 1993
    5  *      The Regents of the University of California.  All rights reserved.
    6  * (c) UNIX System Laboratories, Inc.
    7  * All or some portions of this file are derived from material licensed
    8  * to the University of California by American Telephone and Telegraph
    9  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
   10  * the permission of UNIX System Laboratories, Inc.
   11  *
   12  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
   13  * Copyright (c) 2013, 2014 The FreeBSD Foundation
   14  *
   15  * Portions of this software were developed by Konstantin Belousov
   16  * under sponsorship from the FreeBSD Foundation.
   17  *
   18  * Redistribution and use in source and binary forms, with or without
   19  * modification, are permitted provided that the following conditions
   20  * are met:
   21  * 1. Redistributions of source code must retain the above copyright
   22  *    notice, this list of conditions and the following disclaimer.
   23  * 2. Redistributions in binary form must reproduce the above copyright
   24  *    notice, this list of conditions and the following disclaimer in the
   25  *    documentation and/or other materials provided with the distribution.
   26  * 3. Neither the name of the University nor the names of its contributors
   27  *    may be used to endorse or promote products derived from this software
   28  *    without specific prior written permission.
   29  *
   30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   40  * SUCH DAMAGE.
   41  *
   42  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
   43  */
   44 
   45 #include <sys/cdefs.h>
   46 __FBSDID("$FreeBSD$");
   47 
   48 #include "opt_hwpmc_hooks.h"
   49 
   50 #include <sys/param.h>
   51 #include <sys/systm.h>
   52 #include <sys/disk.h>
   53 #include <sys/fail.h>
   54 #include <sys/fcntl.h>
   55 #include <sys/file.h>
   56 #include <sys/kdb.h>
   57 #include <sys/ktr.h>
   58 #include <sys/stat.h>
   59 #include <sys/priv.h>
   60 #include <sys/proc.h>
   61 #include <sys/limits.h>
   62 #include <sys/lock.h>
   63 #include <sys/mman.h>
   64 #include <sys/mount.h>
   65 #include <sys/mutex.h>
   66 #include <sys/namei.h>
   67 #include <sys/vnode.h>
   68 #include <sys/bio.h>
   69 #include <sys/buf.h>
   70 #include <sys/filio.h>
   71 #include <sys/resourcevar.h>
   72 #include <sys/rwlock.h>
   73 #include <sys/prng.h>
   74 #include <sys/sx.h>
   75 #include <sys/sleepqueue.h>
   76 #include <sys/sysctl.h>
   77 #include <sys/ttycom.h>
   78 #include <sys/conf.h>
   79 #include <sys/syslog.h>
   80 #include <sys/unistd.h>
   81 #include <sys/user.h>
   82 #include <sys/ktrace.h>
   83 
   84 #include <security/audit/audit.h>
   85 #include <security/mac/mac_framework.h>
   86 
   87 #include <vm/vm.h>
   88 #include <vm/vm_extern.h>
   89 #include <vm/pmap.h>
   90 #include <vm/vm_map.h>
   91 #include <vm/vm_object.h>
   92 #include <vm/vm_page.h>
   93 #include <vm/vm_pager.h>
   94 
   95 #ifdef HWPMC_HOOKS
   96 #include <sys/pmckern.h>
   97 #endif
   98 
   99 static fo_rdwr_t        vn_read;
  100 static fo_rdwr_t        vn_write;
  101 static fo_rdwr_t        vn_io_fault;
  102 static fo_truncate_t    vn_truncate;
  103 static fo_ioctl_t       vn_ioctl;
  104 static fo_poll_t        vn_poll;
  105 static fo_kqfilter_t    vn_kqfilter;
  106 static fo_close_t       vn_closefile;
  107 static fo_mmap_t        vn_mmap;
  108 static fo_fallocate_t   vn_fallocate;
  109 static fo_fspacectl_t   vn_fspacectl;
  110 
  111 struct  fileops vnops = {
  112         .fo_read = vn_io_fault,
  113         .fo_write = vn_io_fault,
  114         .fo_truncate = vn_truncate,
  115         .fo_ioctl = vn_ioctl,
  116         .fo_poll = vn_poll,
  117         .fo_kqfilter = vn_kqfilter,
  118         .fo_stat = vn_statfile,
  119         .fo_close = vn_closefile,
  120         .fo_chmod = vn_chmod,
  121         .fo_chown = vn_chown,
  122         .fo_sendfile = vn_sendfile,
  123         .fo_seek = vn_seek,
  124         .fo_fill_kinfo = vn_fill_kinfo,
  125         .fo_mmap = vn_mmap,
  126         .fo_fallocate = vn_fallocate,
  127         .fo_fspacectl = vn_fspacectl,
  128         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
  129 };
  130 
  131 const u_int io_hold_cnt = 16;
  132 static int vn_io_fault_enable = 1;
  133 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
  134     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
  135 static int vn_io_fault_prefault = 0;
  136 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
  137     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
  138 static int vn_io_pgcache_read_enable = 1;
  139 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
  140     &vn_io_pgcache_read_enable, 0,
  141     "Enable copying from page cache for reads, avoiding fs");
  142 static u_long vn_io_faults_cnt;
  143 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
  144     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
  145 
  146 static int vfs_allow_read_dir = 0;
  147 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
  148     &vfs_allow_read_dir, 0,
  149     "Enable read(2) of directory by root for filesystems that support it");
  150 
  151 /*
  152  * Returns true if vn_io_fault mode of handling the i/o request should
  153  * be used.
  154  */
  155 static bool
  156 do_vn_io_fault(struct vnode *vp, struct uio *uio)
  157 {
  158         struct mount *mp;
  159 
  160         return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
  161             (mp = vp->v_mount) != NULL &&
  162             (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
  163 }
  164 
  165 /*
  166  * Structure used to pass arguments to vn_io_fault1(), to do either
  167  * file- or vnode-based I/O calls.
  168  */
  169 struct vn_io_fault_args {
  170         enum {
  171                 VN_IO_FAULT_FOP,
  172                 VN_IO_FAULT_VOP
  173         } kind;
  174         struct ucred *cred;
  175         int flags;
  176         union {
  177                 struct fop_args_tag {
  178                         struct file *fp;
  179                         fo_rdwr_t *doio;
  180                 } fop_args;
  181                 struct vop_args_tag {
  182                         struct vnode *vp;
  183                 } vop_args;
  184         } args;
  185 };
  186 
  187 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
  188     struct vn_io_fault_args *args, struct thread *td);
  189 
  190 int
  191 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
  192 {
  193         struct thread *td = curthread;
  194 
  195         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
  196 }
  197 
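/*
 * Illustrative sketch (not part of the original file): a typical in-kernel
 * open-by-path goes through NDINIT() and vn_open(), and pairs with vn_close()
 * when done.  The exact NDINIT() signature and flag set differ between
 * branches, so treat the details below as assumptions.
 *
 *	struct nameidata nd;
 *	int error, flags;
 *
 *	flags = FREAD;
 *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/path/to/file");
 *	error = vn_open(&nd, &flags, 0, NULL);
 *	if (error == 0) {
 *		NDFREE_PNBUF(&nd);
 *		VOP_UNLOCK(nd.ni_vp);	/* vn_open() returns the vnode locked */
 *		... do I/O, e.g. with vn_rdwr() ...
 *		error = vn_close(nd.ni_vp, FREAD, curthread->td_ucred, curthread);
 *	}
 */
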
  198 static uint64_t
  199 open2nameif(int fmode, u_int vn_open_flags)
  200 {
  201         uint64_t res;
  202 
  203         res = ISOPEN | LOCKLEAF;
  204         if ((fmode & O_RESOLVE_BENEATH) != 0)
  205                 res |= RBENEATH;
  206         if ((fmode & O_EMPTY_PATH) != 0)
  207                 res |= EMPTYPATH;
  208         if ((fmode & FREAD) != 0)
  209                 res |= OPENREAD;
  210         if ((fmode & FWRITE) != 0)
  211                 res |= OPENWRITE;
  212         if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
  213                 res |= AUDITVNODE1;
  214         if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
  215                 res |= NOCAPCHECK;
  216         if ((vn_open_flags & VN_OPEN_WANTIOCTLCAPS) != 0)
  217                 res |= WANTIOCTLCAPS;
  218         return (res);
  219 }
  220 
  221 /*
  222  * Common code for vnode open operations via a name lookup.
  223  * Lookup the vnode and invoke VOP_CREATE if needed.
  224  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  225  *
  226  * Note that this does NOT free nameidata for the successful case,
  227  * due to the NDINIT being done elsewhere.
  228  */
  229 int
  230 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
  231     struct ucred *cred, struct file *fp)
  232 {
  233         struct vnode *vp;
  234         struct mount *mp;
  235         struct vattr vat;
  236         struct vattr *vap = &vat;
  237         int fmode, error;
  238         bool first_open;
  239 
  240 restart:
  241         first_open = false;
  242         fmode = *flagp;
  243         if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
  244             O_EXCL | O_DIRECTORY) ||
  245             (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH))
  246                 return (EINVAL);
  247         else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
  248                 ndp->ni_cnd.cn_nameiop = CREATE;
  249                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
  250                 /*
  251                  * Set NOCACHE to avoid flushing the cache when
  252                  * rolling in many files at once.
  253                  *
  254                  * Set NC_KEEPPOSENTRY to keep positive entries if they already
  255                  * exist despite NOCACHE.
  256                  */
  257                 ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
  258                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
  259                         ndp->ni_cnd.cn_flags |= FOLLOW;
  260                 if ((vn_open_flags & VN_OPEN_INVFS) == 0)
  261                         bwillwrite();
  262                 if ((error = namei(ndp)) != 0)
  263                         return (error);
  264                 if (ndp->ni_vp == NULL) {
  265                         VATTR_NULL(vap);
  266                         vap->va_type = VREG;
  267                         vap->va_mode = cmode;
  268                         if (fmode & O_EXCL)
  269                                 vap->va_vaflags |= VA_EXCLUSIVE;
  270                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
  271                                 NDFREE_PNBUF(ndp);
  272                                 vput(ndp->ni_dvp);
  273                                 if ((error = vn_start_write(NULL, &mp,
  274                                     V_XSLEEP | V_PCATCH)) != 0)
  275                                         return (error);
  276                                 NDREINIT(ndp);
  277                                 goto restart;
  278                         }
  279                         if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
  280                                 ndp->ni_cnd.cn_flags |= MAKEENTRY;
  281 #ifdef MAC
  282                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
  283                             &ndp->ni_cnd, vap);
  284                         if (error == 0)
  285 #endif
  286                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
  287                                     &ndp->ni_cnd, vap);
  288                         vp = ndp->ni_vp;
  289                         if (error == 0 && (fmode & O_EXCL) != 0 &&
  290                             (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
  291                                 VI_LOCK(vp);
  292                                 vp->v_iflag |= VI_FOPENING;
  293                                 VI_UNLOCK(vp);
  294                                 first_open = true;
  295                         }
  296                         VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
  297                             false);
  298                         vn_finished_write(mp);
  299                         if (error) {
  300                                 NDFREE_PNBUF(ndp);
  301                                 if (error == ERELOOKUP) {
  302                                         NDREINIT(ndp);
  303                                         goto restart;
  304                                 }
  305                                 return (error);
  306                         }
  307                         fmode &= ~O_TRUNC;
  308                 } else {
  309                         if (ndp->ni_dvp == ndp->ni_vp)
  310                                 vrele(ndp->ni_dvp);
  311                         else
  312                                 vput(ndp->ni_dvp);
  313                         ndp->ni_dvp = NULL;
  314                         vp = ndp->ni_vp;
  315                         if (fmode & O_EXCL) {
  316                                 error = EEXIST;
  317                                 goto bad;
  318                         }
  319                         if (vp->v_type == VDIR) {
  320                                 error = EISDIR;
  321                                 goto bad;
  322                         }
  323                         fmode &= ~O_CREAT;
  324                 }
  325         } else {
  326                 ndp->ni_cnd.cn_nameiop = LOOKUP;
  327                 ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
  328                 ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
  329                     FOLLOW;
  330                 if ((fmode & FWRITE) == 0)
  331                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
  332                 if ((error = namei(ndp)) != 0)
  333                         return (error);
  334                 vp = ndp->ni_vp;
  335         }
  336         error = vn_open_vnode(vp, fmode, cred, curthread, fp);
  337         if (first_open) {
  338                 VI_LOCK(vp);
  339                 vp->v_iflag &= ~VI_FOPENING;
  340                 wakeup(vp);
  341                 VI_UNLOCK(vp);
  342         }
  343         if (error)
  344                 goto bad;
  345         *flagp = fmode;
  346         return (0);
  347 bad:
  348         NDFREE_PNBUF(ndp);
  349         vput(vp);
  350         *flagp = fmode;
  351         ndp->ni_vp = NULL;
  352         return (error);
  353 }
  354 
  355 static int
  356 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
  357 {
  358         struct flock lf;
  359         int error, lock_flags, type;
  360 
  361         ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
  362         if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
  363                 return (0);
  364         KASSERT(fp != NULL, ("open with flock requires fp"));
  365         if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
  366                 return (EOPNOTSUPP);
  367 
  368         lock_flags = VOP_ISLOCKED(vp);
  369         VOP_UNLOCK(vp);
  370 
  371         lf.l_whence = SEEK_SET;
  372         lf.l_start = 0;
  373         lf.l_len = 0;
  374         lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
  375         type = F_FLOCK;
  376         if ((fmode & FNONBLOCK) == 0)
  377                 type |= F_WAIT;
  378         if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
  379                 type |= F_FIRSTOPEN;
  380         error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
  381         if (error == 0)
  382                 fp->f_flag |= FHASLOCK;
  383 
  384         vn_lock(vp, lock_flags | LK_RETRY);
  385         return (error);
  386 }
  387 
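/*
 * Illustrative note (not part of the original file): O_EXLOCK and O_SHLOCK
 * are the open(2)-time counterparts of flock(2).  A hedged userland sketch:
 *
 *	int fd = open("/var/db/example.lock", O_RDWR | O_EXLOCK | O_NONBLOCK);
 *
 * acquires an exclusive whole-file lock atomically with the open; with
 * O_NONBLOCK set, F_WAIT is left out above, so the open fails instead of
 * sleeping if a conflicting lock is already held.
 */
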
  388 /*
  389  * Common code for vnode open operations once a vnode is located.
  390  * Check permissions, and call the VOP_OPEN routine.
  391  */
  392 int
  393 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
  394     struct thread *td, struct file *fp)
  395 {
  396         accmode_t accmode;
  397         int error;
  398 
  399         if (vp->v_type == VLNK) {
  400                 if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
  401                         return (EMLINK);
  402         }
  403         if (vp->v_type != VDIR && fmode & O_DIRECTORY)
  404                 return (ENOTDIR);
  405 
  406         accmode = 0;
  407         if ((fmode & O_PATH) == 0) {
  408                 if (vp->v_type == VSOCK)
  409                         return (EOPNOTSUPP);
  410                 if ((fmode & (FWRITE | O_TRUNC)) != 0) {
  411                         if (vp->v_type == VDIR)
  412                                 return (EISDIR);
  413                         accmode |= VWRITE;
  414                 }
  415                 if ((fmode & FREAD) != 0)
  416                         accmode |= VREAD;
  417                 if ((fmode & O_APPEND) && (fmode & FWRITE))
  418                         accmode |= VAPPEND;
  419 #ifdef MAC
  420                 if ((fmode & O_CREAT) != 0)
  421                         accmode |= VCREAT;
  422 #endif
  423         }
  424         if ((fmode & FEXEC) != 0)
  425                 accmode |= VEXEC;
  426 #ifdef MAC
  427         if ((fmode & O_VERIFY) != 0)
  428                 accmode |= VVERIFY;
  429         error = mac_vnode_check_open(cred, vp, accmode);
  430         if (error != 0)
  431                 return (error);
  432 
  433         accmode &= ~(VCREAT | VVERIFY);
  434 #endif
  435         if ((fmode & O_CREAT) == 0 && accmode != 0) {
  436                 error = VOP_ACCESS(vp, accmode, cred, td);
  437                 if (error != 0)
  438                         return (error);
  439         }
  440         if ((fmode & O_PATH) != 0) {
  441                 if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
  442                     VOP_ACCESS(vp, VREAD, cred, td) == 0)
  443                         fp->f_flag |= FKQALLOWED;
  444                 return (0);
  445         }
  446 
  447         if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
  448                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
  449         error = VOP_OPEN(vp, fmode, cred, td, fp);
  450         if (error != 0)
  451                 return (error);
  452 
  453         error = vn_open_vnode_advlock(vp, fmode, fp);
  454         if (error == 0 && (fmode & FWRITE) != 0) {
  455                 error = VOP_ADD_WRITECOUNT(vp, 1);
  456                 if (error == 0) {
  457                         CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
  458                              __func__, vp, vp->v_writecount);
  459                 }
  460         }
  461 
  462         /*
  463          * Error from advlock or VOP_ADD_WRITECOUNT() still requires
  464          * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
  465          */
  466         if (error != 0) {
  467                 if (fp != NULL) {
  468                         /*
  469                          * Arrange the call by having fdrop() to use
  470                          * vn_closefile().  This is to satisfy
  471                          * filesystems like devfs or tmpfs, which
  472                          * override fo_close().
  473                          */
  474                         fp->f_flag |= FOPENFAILED;
  475                         fp->f_vnode = vp;
  476                         if (fp->f_ops == &badfileops) {
  477                                 fp->f_type = DTYPE_VNODE;
  478                                 fp->f_ops = &vnops;
  479                         }
  480                         vref(vp);
  481                 } else {
  482                         /*
  483                          * If there is no fp, due to kernel-mode open,
  484                          * we can call VOP_CLOSE() now.
  485                          */
  486                         if (vp->v_type != VFIFO && (fmode & FWRITE) != 0 &&
  487                             !MNT_EXTENDED_SHARED(vp->v_mount) &&
  488                             VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
  489                                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
  490                         (void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC),
  491                             cred, td);
  492                 }
  493         }
  494 
  495         ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
  496         return (error);
  497 
  498 }
  499 
  500 /*
  501  * Check for write permissions on the specified vnode.
  502  * Prototype text segments cannot be written.
  503  * It is racy.
  504  */
  505 int
  506 vn_writechk(struct vnode *vp)
  507 {
  508 
  509         ASSERT_VOP_LOCKED(vp, "vn_writechk");
  510         /*
  511          * If there's shared text associated with
  512          * the vnode, try to free it up once.  If
  513          * we fail, we can't allow writing.
  514          */
  515         if (VOP_IS_TEXT(vp))
  516                 return (ETXTBSY);
  517 
  518         return (0);
  519 }
  520 
  521 /*
  522  * Vnode close call
  523  */
  524 static int
  525 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
  526     struct thread *td, bool keep_ref)
  527 {
  528         struct mount *mp;
  529         int error, lock_flags;
  530 
  531         if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
  532             MNT_EXTENDED_SHARED(vp->v_mount))
  533                 lock_flags = LK_SHARED;
  534         else
  535                 lock_flags = LK_EXCLUSIVE;
  536 
  537         vn_start_write(vp, &mp, V_WAIT);
  538         vn_lock(vp, lock_flags | LK_RETRY);
  539         AUDIT_ARG_VNODE1(vp);
  540         if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
  541                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
  542                 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
  543                     __func__, vp, vp->v_writecount);
  544         }
  545         error = VOP_CLOSE(vp, flags, file_cred, td);
  546         if (keep_ref)
  547                 VOP_UNLOCK(vp);
  548         else
  549                 vput(vp);
  550         vn_finished_write(mp);
  551         return (error);
  552 }
  553 
  554 int
  555 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
  556     struct thread *td)
  557 {
  558 
  559         return (vn_close1(vp, flags, file_cred, td, false));
  560 }
  561 
  562 /*
  563  * Heuristic to detect sequential operation.
  564  */
  565 static int
  566 sequential_heuristic(struct uio *uio, struct file *fp)
  567 {
  568         enum uio_rw rw;
  569 
  570         ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
  571 
  572         rw = uio->uio_rw;
  573         if (fp->f_flag & FRDAHEAD)
  574                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
  575 
  576         /*
  577          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
  578          * that the first I/O is normally considered to be slightly
  579          * sequential.  Seeking to offset 0 doesn't change sequentiality
  580          * unless previous seeks have reduced f_seqcount to 0, in which
  581          * case offset 0 is not special.
  582          */
  583         if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
  584             uio->uio_offset == fp->f_nextoff[rw]) {
  585                 /*
  586                  * f_seqcount is in units of fixed-size blocks so that it
  587                  * depends mainly on the amount of sequential I/O and not
  588                  * much on the number of sequential I/O's.  The fixed size
  589                  * of 16384 is hard-coded here since it is (not quite) just
  590                  * a magic size that works well here.  This size is more
  591                  * closely related to the best I/O size for real disks than
  592                  * to any block size used by software.
  593                  */
  594                 if (uio->uio_resid >= IO_SEQMAX * 16384)
  595                         fp->f_seqcount[rw] = IO_SEQMAX;
  596                 else {
  597                         fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
  598                         if (fp->f_seqcount[rw] > IO_SEQMAX)
  599                                 fp->f_seqcount[rw] = IO_SEQMAX;
  600                 }
  601                 return (fp->f_seqcount[rw] << IO_SEQSHIFT);
  602         }
  603 
  604         /* Not sequential.  Quickly draw-down sequentiality. */
  605         if (fp->f_seqcount[rw] > 1)
  606                 fp->f_seqcount[rw] = 1;
  607         else
  608                 fp->f_seqcount[rw] = 0;
  609         return (0);
  610 }
  611 
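/*
 * Worked example (added for illustration): a 64 KiB read that continues at
 * f_nextoff adds howmany(65536, 16384) = 4 to f_seqcount (clamped to
 * IO_SEQMAX), and the caller ORs the returned f_seqcount << IO_SEQSHIFT into
 * ioflag; filesystems recover it with ioflag >> IO_SEQSHIFT to size their
 * read-ahead.
 */
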
  612 /*
  613  * Package up an I/O request on a vnode into a uio and do it.
  614  */
  615 int
  616 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
  617     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
  618     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
  619 {
  620         struct uio auio;
  621         struct iovec aiov;
  622         struct mount *mp;
  623         struct ucred *cred;
  624         void *rl_cookie;
  625         struct vn_io_fault_args args;
  626         int error, lock_flags;
  627 
  628         if (offset < 0 && vp->v_type != VCHR)
  629                 return (EINVAL);
  630         auio.uio_iov = &aiov;
  631         auio.uio_iovcnt = 1;
  632         aiov.iov_base = base;
  633         aiov.iov_len = len;
  634         auio.uio_resid = len;
  635         auio.uio_offset = offset;
  636         auio.uio_segflg = segflg;
  637         auio.uio_rw = rw;
  638         auio.uio_td = td;
  639         error = 0;
  640 
  641         if ((ioflg & IO_NODELOCKED) == 0) {
  642                 if ((ioflg & IO_RANGELOCKED) == 0) {
  643                         if (rw == UIO_READ) {
  644                                 rl_cookie = vn_rangelock_rlock(vp, offset,
  645                                     offset + len);
  646                         } else if ((ioflg & IO_APPEND) != 0) {
  647                                 rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
  648                         } else {
  649                                 rl_cookie = vn_rangelock_wlock(vp, offset,
  650                                     offset + len);
  651                         }
  652                 } else
  653                         rl_cookie = NULL;
  654                 mp = NULL;
  655                 if (rw == UIO_WRITE) { 
  656                         if (vp->v_type != VCHR &&
  657                             (error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH))
  658                             != 0)
  659                                 goto out;
  660                         lock_flags = vn_lktype_write(mp, vp);
  661                 } else
  662                         lock_flags = LK_SHARED;
  663                 vn_lock(vp, lock_flags | LK_RETRY);
  664         } else
  665                 rl_cookie = NULL;
  666 
  667         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
  668 #ifdef MAC
  669         if ((ioflg & IO_NOMACCHECK) == 0) {
  670                 if (rw == UIO_READ)
  671                         error = mac_vnode_check_read(active_cred, file_cred,
  672                             vp);
  673                 else
  674                         error = mac_vnode_check_write(active_cred, file_cred,
  675                             vp);
  676         }
  677 #endif
  678         if (error == 0) {
  679                 if (file_cred != NULL)
  680                         cred = file_cred;
  681                 else
  682                         cred = active_cred;
  683                 if (do_vn_io_fault(vp, &auio)) {
  684                         args.kind = VN_IO_FAULT_VOP;
  685                         args.cred = cred;
  686                         args.flags = ioflg;
  687                         args.args.vop_args.vp = vp;
  688                         error = vn_io_fault1(vp, &auio, &args, td);
  689                 } else if (rw == UIO_READ) {
  690                         error = VOP_READ(vp, &auio, ioflg, cred);
  691                 } else /* if (rw == UIO_WRITE) */ {
  692                         error = VOP_WRITE(vp, &auio, ioflg, cred);
  693                 }
  694         }
  695         if (aresid)
  696                 *aresid = auio.uio_resid;
  697         else
  698                 if (auio.uio_resid && error == 0)
  699                         error = EIO;
  700         if ((ioflg & IO_NODELOCKED) == 0) {
  701                 VOP_UNLOCK(vp);
  702                 if (mp != NULL)
  703                         vn_finished_write(mp);
  704         }
  705  out:
  706         if (rl_cookie != NULL)
  707                 vn_rangelock_unlock(vp, rl_cookie);
  708         return (error);
  709 }
  710 
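/*
 * Illustrative sketch (not part of the original file): reading the first
 * 512 bytes of a referenced, unlocked vnode from the kernel with the wrapper
 * above; "buf" and the credentials are placeholders.
 *
 *	char buf[512];
 *	ssize_t resid;
 *	int error;
 *
 *	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), 0, UIO_SYSSPACE,
 *	    0, curthread->td_ucred, NOCRED, &resid, curthread);
 *
 * On success, sizeof(buf) - resid bytes were transferred; passing a NULL
 * aresid instead turns any short transfer into EIO, as coded above.
 */
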
  711 /*
  712  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  713  * request is split up into smaller chunks and we try to avoid saturating
  714  * the buffer cache while potentially holding a vnode locked, so we 
  715  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
  716  * to give other processes a chance to lock the vnode (either other processes
  717  * core'ing the same binary, or unrelated processes scanning the directory).
  718  */
  719 int
  720 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
  721     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
  722     struct ucred *file_cred, size_t *aresid, struct thread *td)
  723 {
  724         int error = 0;
  725         ssize_t iaresid;
  726 
  727         do {
  728                 int chunk;
  729 
  730                 /*
  731                  * Force `offset' to a multiple of MAXBSIZE except possibly
  732                  * for the first chunk, so that filesystems only need to
  733                  * write full blocks except possibly for the first and last
  734                  * chunks.
  735                  */
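                /*
                 * Example (illustrative): with MAXBSIZE = 65536 and
                 * offset = 10000, the first chunk is 65536 - 10000 = 55536
                 * bytes; offset is then block-aligned, so later chunks are
                 * full MAXBSIZE blocks until the final remainder.
                 */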
  736                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
  737 
  738                 if (chunk > len)
  739                         chunk = len;
  740                 if (rw != UIO_READ && vp->v_type == VREG)
  741                         bwillwrite();
  742                 iaresid = 0;
  743                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
  744                     ioflg, active_cred, file_cred, &iaresid, td);
  745                 len -= chunk;   /* aresid calc already includes length */
  746                 if (error)
  747                         break;
  748                 offset += chunk;
  749                 base = (char *)base + chunk;
  750                 kern_yield(PRI_USER);
  751         } while (len);
  752         if (aresid)
  753                 *aresid = len + iaresid;
  754         return (error);
  755 }
  756 
  757 #if OFF_MAX <= LONG_MAX
  758 off_t
  759 foffset_lock(struct file *fp, int flags)
  760 {
  761         volatile short *flagsp;
  762         off_t res;
  763         short state;
  764 
  765         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
  766 
  767         if ((flags & FOF_NOLOCK) != 0)
  768                 return (atomic_load_long(&fp->f_offset));
  769 
  770         /*
  771          * According to McKusick the vn lock was protecting f_offset here.
  772          * It is now protected by the FOFFSET_LOCKED flag.
  773          */
  774         flagsp = &fp->f_vnread_flags;
  775         if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
  776                 return (atomic_load_long(&fp->f_offset));
  777 
  778         sleepq_lock(&fp->f_vnread_flags);
  779         state = atomic_load_16(flagsp);
  780         for (;;) {
  781                 if ((state & FOFFSET_LOCKED) == 0) {
  782                         if (!atomic_fcmpset_acq_16(flagsp, &state,
  783                             FOFFSET_LOCKED))
  784                                 continue;
  785                         break;
  786                 }
  787                 if ((state & FOFFSET_LOCK_WAITING) == 0) {
  788                         if (!atomic_fcmpset_acq_16(flagsp, &state,
  789                             state | FOFFSET_LOCK_WAITING))
  790                                 continue;
  791                 }
  792                 DROP_GIANT();
  793                 sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
   794                 sleepq_wait(&fp->f_vnread_flags, PUSER - 1);
  795                 PICKUP_GIANT();
  796                 sleepq_lock(&fp->f_vnread_flags);
  797                 state = atomic_load_16(flagsp);
  798         }
  799         res = atomic_load_long(&fp->f_offset);
  800         sleepq_release(&fp->f_vnread_flags);
  801         return (res);
  802 }
  803 
  804 void
  805 foffset_unlock(struct file *fp, off_t val, int flags)
  806 {
  807         volatile short *flagsp;
  808         short state;
  809 
  810         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
  811 
  812         if ((flags & FOF_NOUPDATE) == 0)
  813                 atomic_store_long(&fp->f_offset, val);
  814         if ((flags & FOF_NEXTOFF_R) != 0)
  815                 fp->f_nextoff[UIO_READ] = val;
  816         if ((flags & FOF_NEXTOFF_W) != 0)
  817                 fp->f_nextoff[UIO_WRITE] = val;
  818 
  819         if ((flags & FOF_NOLOCK) != 0)
  820                 return;
  821 
  822         flagsp = &fp->f_vnread_flags;
  823         state = atomic_load_16(flagsp);
  824         if ((state & FOFFSET_LOCK_WAITING) == 0 &&
  825             atomic_cmpset_rel_16(flagsp, state, 0))
  826                 return;
  827 
  828         sleepq_lock(&fp->f_vnread_flags);
  829         MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
  830         MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
  831         fp->f_vnread_flags = 0;
  832         sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
  833         sleepq_release(&fp->f_vnread_flags);
  834 }
  835 #else
  836 off_t
  837 foffset_lock(struct file *fp, int flags)
  838 {
  839         struct mtx *mtxp;
  840         off_t res;
  841 
  842         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
  843 
  844         mtxp = mtx_pool_find(mtxpool_sleep, fp);
  845         mtx_lock(mtxp);
  846         if ((flags & FOF_NOLOCK) == 0) {
  847                 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
  848                         fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
   849                         msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
  850                             "vofflock", 0);
  851                 }
  852                 fp->f_vnread_flags |= FOFFSET_LOCKED;
  853         }
  854         res = fp->f_offset;
  855         mtx_unlock(mtxp);
  856         return (res);
  857 }
  858 
  859 void
  860 foffset_unlock(struct file *fp, off_t val, int flags)
  861 {
  862         struct mtx *mtxp;
  863 
  864         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
  865 
  866         mtxp = mtx_pool_find(mtxpool_sleep, fp);
  867         mtx_lock(mtxp);
  868         if ((flags & FOF_NOUPDATE) == 0)
  869                 fp->f_offset = val;
  870         if ((flags & FOF_NEXTOFF_R) != 0)
  871                 fp->f_nextoff[UIO_READ] = val;
  872         if ((flags & FOF_NEXTOFF_W) != 0)
  873                 fp->f_nextoff[UIO_WRITE] = val;
  874         if ((flags & FOF_NOLOCK) == 0) {
  875                 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
  876                     ("Lost FOFFSET_LOCKED"));
  877                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
  878                         wakeup(&fp->f_vnread_flags);
  879                 fp->f_vnread_flags = 0;
  880         }
  881         mtx_unlock(mtxp);
  882 }
  883 #endif
  884 
  885 void
  886 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
  887 {
  888 
  889         if ((flags & FOF_OFFSET) == 0)
  890                 uio->uio_offset = foffset_lock(fp, flags);
  891 }
  892 
  893 void
  894 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
  895 {
  896 
  897         if ((flags & FOF_OFFSET) == 0)
  898                 foffset_unlock(fp, uio->uio_offset, flags);
  899 }
  900 
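/*
 * Illustrative note (not part of the original file): fileops methods in this
 * file bracket an I/O with these helpers so the file offset is sampled and
 * written back consistently; a simplified sketch of the pattern used by
 * vn_io_fault() further below:
 *
 *	foffset_lock_uio(fp, uio, flags);
 *	error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 *	foffset_unlock_uio(fp, uio, flags);
 */
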
  901 static int
  902 get_advice(struct file *fp, struct uio *uio)
  903 {
  904         struct mtx *mtxp;
  905         int ret;
  906 
  907         ret = POSIX_FADV_NORMAL;
  908         if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
  909                 return (ret);
  910 
  911         mtxp = mtx_pool_find(mtxpool_sleep, fp);
  912         mtx_lock(mtxp);
  913         if (fp->f_advice != NULL &&
  914             uio->uio_offset >= fp->f_advice->fa_start &&
  915             uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
  916                 ret = fp->f_advice->fa_advice;
  917         mtx_unlock(mtxp);
  918         return (ret);
  919 }
  920 
  921 static int
  922 get_write_ioflag(struct file *fp)
  923 {
  924         int ioflag;
  925         struct mount *mp;
  926         struct vnode *vp;
  927 
  928         ioflag = 0;
  929         vp = fp->f_vnode;
  930         mp = atomic_load_ptr(&vp->v_mount);
  931 
  932         if ((fp->f_flag & O_DIRECT) != 0)
  933                 ioflag |= IO_DIRECT;
  934 
  935         if ((fp->f_flag & O_FSYNC) != 0 ||
  936             (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0))
  937                 ioflag |= IO_SYNC;
  938 
  939         /*
  940          * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
  941          * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC
  942          * fall back to full O_SYNC behavior.
  943          */
  944         if ((fp->f_flag & O_DSYNC) != 0)
  945                 ioflag |= IO_SYNC | IO_DATASYNC;
  946 
  947         return (ioflag);
  948 }
  949 
  950 int
  951 vn_read_from_obj(struct vnode *vp, struct uio *uio)
  952 {
  953         vm_object_t obj;
  954         vm_page_t ma[io_hold_cnt + 2];
  955         off_t off, vsz;
  956         ssize_t resid;
  957         int error, i, j;
  958 
  959         MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
  960         obj = atomic_load_ptr(&vp->v_object);
  961         if (obj == NULL)
  962                 return (EJUSTRETURN);
  963 
  964         /*
  965          * Depends on type stability of vm_objects.
  966          */
  967         vm_object_pip_add(obj, 1);
  968         if ((obj->flags & OBJ_DEAD) != 0) {
  969                 /*
   970                  * Note that the object might already have been reused
   971                  * from the vnode, and the OBJ_DEAD flag cleared.  This
   972                  * is fine: we recheck for a DOOMED vnode state after
   973                  * all pages are busied, and back out then.
  974                  *
  975                  * But we check for OBJ_DEAD to ensure that we do not
  976                  * busy pages while vm_object_terminate_pages()
  977                  * processes the queue.
  978                  */
  979                 error = EJUSTRETURN;
  980                 goto out_pip;
  981         }
  982 
  983         resid = uio->uio_resid;
  984         off = uio->uio_offset;
  985         for (i = 0; resid > 0; i++) {
  986                 MPASS(i < io_hold_cnt + 2);
  987                 ma[i] = vm_page_grab_unlocked(obj, atop(off),
  988                     VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
  989                     VM_ALLOC_NOWAIT);
  990                 if (ma[i] == NULL)
  991                         break;
  992 
  993                 /*
  994                  * Skip invalid pages.  Valid mask can be partial only
  995                  * at EOF, and we clip later.
  996                  */
  997                 if (vm_page_none_valid(ma[i])) {
  998                         vm_page_sunbusy(ma[i]);
  999                         break;
 1000                 }
 1001 
 1002                 resid -= PAGE_SIZE;
 1003                 off += PAGE_SIZE;
 1004         }
 1005         if (i == 0) {
 1006                 error = EJUSTRETURN;
 1007                 goto out_pip;
 1008         }
 1009 
 1010         /*
 1011          * Check VIRF_DOOMED after we busied our pages.  Since
  1012          * vgonel() terminates the vnode's vm_object, it cannot
 1013          * process past pages busied by us.
 1014          */
 1015         if (VN_IS_DOOMED(vp)) {
 1016                 error = EJUSTRETURN;
 1017                 goto out;
 1018         }
 1019 
 1020         resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
 1021         if (resid > uio->uio_resid)
 1022                 resid = uio->uio_resid;
 1023 
 1024         /*
 1025          * Unlocked read of vnp_size is safe because truncation cannot
 1026          * pass busied page.  But we load vnp_size into a local
 1027          * variable so that possible concurrent extension does not
 1028          * break calculation.
 1029          */
 1030 #if defined(__powerpc__) && !defined(__powerpc64__)
 1031         vsz = obj->un_pager.vnp.vnp_size;
 1032 #else
 1033         vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
 1034 #endif
 1035         if (uio->uio_offset >= vsz) {
 1036                 error = EJUSTRETURN;
 1037                 goto out;
 1038         }
 1039         if (uio->uio_offset + resid > vsz)
 1040                 resid = vsz - uio->uio_offset;
 1041 
 1042         error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
 1043 
 1044 out:
 1045         for (j = 0; j < i; j++) {
 1046                 if (error == 0)
 1047                         vm_page_reference(ma[j]);
 1048                 vm_page_sunbusy(ma[j]);
 1049         }
 1050 out_pip:
 1051         vm_object_pip_wakeup(obj);
 1052         if (error != 0)
 1053                 return (error);
 1054         return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
 1055 }
 1056 
 1057 /*
 1058  * File table vnode read routine.
 1059  */
 1060 static int
 1061 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
 1062     struct thread *td)
 1063 {
 1064         struct vnode *vp;
 1065         off_t orig_offset;
 1066         int error, ioflag;
 1067         int advice;
 1068 
 1069         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 1070             uio->uio_td, td));
 1071         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 1072         vp = fp->f_vnode;
 1073         ioflag = 0;
 1074         if (fp->f_flag & FNONBLOCK)
 1075                 ioflag |= IO_NDELAY;
 1076         if (fp->f_flag & O_DIRECT)
 1077                 ioflag |= IO_DIRECT;
 1078 
 1079         /*
 1080          * Try to read from page cache.  VIRF_DOOMED check is racy but
 1081          * allows us to avoid unneeded work outright.
 1082          */
 1083         if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
 1084             (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
 1085                 error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
 1086                 if (error == 0) {
 1087                         fp->f_nextoff[UIO_READ] = uio->uio_offset;
 1088                         return (0);
 1089                 }
 1090                 if (error != EJUSTRETURN)
 1091                         return (error);
 1092         }
 1093 
 1094         advice = get_advice(fp, uio);
 1095         vn_lock(vp, LK_SHARED | LK_RETRY);
 1096 
 1097         switch (advice) {
 1098         case POSIX_FADV_NORMAL:
 1099         case POSIX_FADV_SEQUENTIAL:
 1100         case POSIX_FADV_NOREUSE:
 1101                 ioflag |= sequential_heuristic(uio, fp);
 1102                 break;
 1103         case POSIX_FADV_RANDOM:
 1104                 /* Disable read-ahead for random I/O. */
 1105                 break;
 1106         }
 1107         orig_offset = uio->uio_offset;
 1108 
 1109 #ifdef MAC
 1110         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
 1111         if (error == 0)
 1112 #endif
 1113                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 1114         fp->f_nextoff[UIO_READ] = uio->uio_offset;
 1115         VOP_UNLOCK(vp);
 1116         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 1117             orig_offset != uio->uio_offset)
 1118                 /*
 1119                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
 1120                  * for the backing file after a POSIX_FADV_NOREUSE
 1121                  * read(2).
 1122                  */
 1123                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 1124                     POSIX_FADV_DONTNEED);
 1125         return (error);
 1126 }
 1127 
 1128 /*
 1129  * File table vnode write routine.
 1130  */
 1131 static int
 1132 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
 1133     struct thread *td)
 1134 {
 1135         struct vnode *vp;
 1136         struct mount *mp;
 1137         off_t orig_offset;
 1138         int error, ioflag;
 1139         int advice;
 1140         bool need_finished_write;
 1141 
 1142         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 1143             uio->uio_td, td));
 1144         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 1145         vp = fp->f_vnode;
 1146         if (vp->v_type == VREG)
 1147                 bwillwrite();
 1148         ioflag = IO_UNIT;
 1149         if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0)
 1150                 ioflag |= IO_APPEND;
 1151         if ((fp->f_flag & FNONBLOCK) != 0)
 1152                 ioflag |= IO_NDELAY;
 1153         ioflag |= get_write_ioflag(fp);
 1154 
 1155         mp = NULL;
 1156         need_finished_write = false;
 1157         if (vp->v_type != VCHR) {
 1158                 error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 1159                 if (error != 0)
 1160                         goto unlock;
 1161                 need_finished_write = true;
 1162         }
 1163 
 1164         advice = get_advice(fp, uio);
 1165 
 1166         vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 1167         switch (advice) {
 1168         case POSIX_FADV_NORMAL:
 1169         case POSIX_FADV_SEQUENTIAL:
 1170         case POSIX_FADV_NOREUSE:
 1171                 ioflag |= sequential_heuristic(uio, fp);
 1172                 break;
 1173         case POSIX_FADV_RANDOM:
 1174                 /* XXX: Is this correct? */
 1175                 break;
 1176         }
 1177         orig_offset = uio->uio_offset;
 1178 
 1179 #ifdef MAC
 1180         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 1181         if (error == 0)
 1182 #endif
 1183                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 1184         fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
 1185         VOP_UNLOCK(vp);
 1186         if (need_finished_write)
 1187                 vn_finished_write(mp);
 1188         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 1189             orig_offset != uio->uio_offset)
 1190                 /*
 1191                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
 1192                  * for the backing file after a POSIX_FADV_NOREUSE
 1193                  * write(2).
 1194                  */
 1195                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 1196                     POSIX_FADV_DONTNEED);
 1197 unlock:
 1198         return (error);
 1199 }
 1200 
 1201 /*
 1202  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
 1203  * prevent the following deadlock:
 1204  *
 1205  * Assume that the thread A reads from the vnode vp1 into userspace
 1206  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
 1207  * currently not resident, then system ends up with the call chain
 1208  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
 1209  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
 1210  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
 1211  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
 1212  * backed by the pages of vnode vp1, and some page in buf2 is not
 1213  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
 1214  *
 1215  * To prevent the lock order reversal and deadlock, vn_io_fault() does
 1216  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
 1217  * Instead, it first tries to do the whole range i/o with pagefaults
 1218  * disabled. If all pages in the i/o buffer are resident and mapped,
 1219  * VOP will succeed (ignoring the genuine filesystem errors).
 1220  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
 1221  * i/o in chunks, with all pages in the chunk prefaulted and held
 1222  * using vm_fault_quick_hold_pages().
 1223  *
 1224  * Filesystems using this deadlock avoidance scheme should use the
 1225  * array of the held pages from uio, saved in the curthread->td_ma,
 1226  * instead of doing uiomove().  A helper function
 1227  * vn_io_fault_uiomove() converts uiomove request into
 1228  * uiomove_fromphys() over td_ma array.
 1229  *
 1230  * Since vnode locks do not cover the whole i/o anymore, rangelocks
 1231  * make the current i/o request atomic with respect to other i/os and
 1232  * truncations.
 1233  */
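
/*
 * Illustrative sketch (not part of the original file): in a filesystem
 * VOP_READ() that participates in this scheme, the copy step becomes
 *
 *	error = vn_io_fault_uiomove(bp->b_data + blkoffset, xfersize, uio);
 *
 * instead of a plain uiomove(); when the pages are held in td_ma, the helper
 * copies through uiomove_fromphys(), otherwise it degrades to uiomove().
 * "bp", "blkoffset" and "xfersize" stand in for the filesystem's own buffer
 * state.
 */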
 1234 
 1235 /*
 1236  * Decode vn_io_fault_args and perform the corresponding i/o.
 1237  */
 1238 static int
 1239 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
 1240     struct thread *td)
 1241 {
 1242         int error, save;
 1243 
 1244         error = 0;
 1245         save = vm_fault_disable_pagefaults();
 1246         switch (args->kind) {
 1247         case VN_IO_FAULT_FOP:
 1248                 error = (args->args.fop_args.doio)(args->args.fop_args.fp,
 1249                     uio, args->cred, args->flags, td);
 1250                 break;
 1251         case VN_IO_FAULT_VOP:
 1252                 if (uio->uio_rw == UIO_READ) {
 1253                         error = VOP_READ(args->args.vop_args.vp, uio,
 1254                             args->flags, args->cred);
 1255                 } else if (uio->uio_rw == UIO_WRITE) {
 1256                         error = VOP_WRITE(args->args.vop_args.vp, uio,
 1257                             args->flags, args->cred);
 1258                 }
 1259                 break;
 1260         default:
 1261                 panic("vn_io_fault_doio: unknown kind of io %d %d",
 1262                     args->kind, uio->uio_rw);
 1263         }
 1264         vm_fault_enable_pagefaults(save);
 1265         return (error);
 1266 }
 1267 
 1268 static int
 1269 vn_io_fault_touch(char *base, const struct uio *uio)
 1270 {
 1271         int r;
 1272 
 1273         r = fubyte(base);
 1274         if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
 1275                 return (EFAULT);
 1276         return (0);
 1277 }
 1278 
 1279 static int
 1280 vn_io_fault_prefault_user(const struct uio *uio)
 1281 {
 1282         char *base;
 1283         const struct iovec *iov;
 1284         size_t len;
 1285         ssize_t resid;
 1286         int error, i;
 1287 
 1288         KASSERT(uio->uio_segflg == UIO_USERSPACE,
 1289             ("vn_io_fault_prefault userspace"));
 1290 
 1291         error = i = 0;
 1292         iov = uio->uio_iov;
 1293         resid = uio->uio_resid;
 1294         base = iov->iov_base;
 1295         len = iov->iov_len;
 1296         while (resid > 0) {
 1297                 error = vn_io_fault_touch(base, uio);
 1298                 if (error != 0)
 1299                         break;
 1300                 if (len < PAGE_SIZE) {
 1301                         if (len != 0) {
 1302                                 error = vn_io_fault_touch(base + len - 1, uio);
 1303                                 if (error != 0)
 1304                                         break;
 1305                                 resid -= len;
 1306                         }
 1307                         if (++i >= uio->uio_iovcnt)
 1308                                 break;
 1309                         iov = uio->uio_iov + i;
 1310                         base = iov->iov_base;
 1311                         len = iov->iov_len;
 1312                 } else {
 1313                         len -= PAGE_SIZE;
 1314                         base += PAGE_SIZE;
 1315                         resid -= PAGE_SIZE;
 1316                 }
 1317         }
 1318         return (error);
 1319 }
 1320 
 1321 /*
 1322  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
 1323  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
 1324  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
 1325  * into args and call vn_io_fault1() to handle faults during the user
 1326  * mode buffer accesses.
 1327  */
 1328 static int
 1329 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
 1330     struct thread *td)
 1331 {
 1332         vm_page_t ma[io_hold_cnt + 2];
 1333         struct uio *uio_clone, short_uio;
 1334         struct iovec short_iovec[1];
 1335         vm_page_t *prev_td_ma;
 1336         vm_prot_t prot;
 1337         vm_offset_t addr, end;
 1338         size_t len, resid;
 1339         ssize_t adv;
 1340         int error, cnt, saveheld, prev_td_ma_cnt;
 1341 
 1342         if (vn_io_fault_prefault) {
 1343                 error = vn_io_fault_prefault_user(uio);
 1344                 if (error != 0)
 1345                         return (error); /* Or ignore ? */
 1346         }
 1347 
 1348         prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 1349 
 1350         /*
  1351          * UFS follows the IO_UNIT directive and rolls back both
  1352          * uio_offset and uio_resid if an error is encountered during the
  1353          * operation.  But, since the iovec may already have been advanced,
  1354          * uio is still in an inconsistent state.
 1355          *
 1356          * Cache a copy of the original uio, which is advanced to the redo
 1357          * point using UIO_NOCOPY below.
 1358          */
 1359         uio_clone = cloneuio(uio);
 1360         resid = uio->uio_resid;
 1361 
 1362         short_uio.uio_segflg = UIO_USERSPACE;
 1363         short_uio.uio_rw = uio->uio_rw;
 1364         short_uio.uio_td = uio->uio_td;
 1365 
 1366         error = vn_io_fault_doio(args, uio, td);
 1367         if (error != EFAULT)
 1368                 goto out;
 1369 
 1370         atomic_add_long(&vn_io_faults_cnt, 1);
 1371         uio_clone->uio_segflg = UIO_NOCOPY;
 1372         uiomove(NULL, resid - uio->uio_resid, uio_clone);
 1373         uio_clone->uio_segflg = uio->uio_segflg;
 1374 
 1375         saveheld = curthread_pflags_set(TDP_UIOHELD);
 1376         prev_td_ma = td->td_ma;
 1377         prev_td_ma_cnt = td->td_ma_cnt;
 1378 
 1379         while (uio_clone->uio_resid != 0) {
 1380                 len = uio_clone->uio_iov->iov_len;
 1381                 if (len == 0) {
 1382                         KASSERT(uio_clone->uio_iovcnt >= 1,
 1383                             ("iovcnt underflow"));
 1384                         uio_clone->uio_iov++;
 1385                         uio_clone->uio_iovcnt--;
 1386                         continue;
 1387                 }
 1388                 if (len > ptoa(io_hold_cnt))
 1389                         len = ptoa(io_hold_cnt);
 1390                 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 1391                 end = round_page(addr + len);
 1392                 if (end < addr) {
 1393                         error = EFAULT;
 1394                         break;
 1395                 }
 1396                 /*
 1397                  * A perfectly misaligned address and length could cause
 1398                  * both the start and the end of the chunk to use a partial
 1399                  * page.  The +2 accounts for such a situation.
 1400                  */
 1401                 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
 1402                     addr, len, prot, ma, io_hold_cnt + 2);
 1403                 if (cnt == -1) {
 1404                         error = EFAULT;
 1405                         break;
 1406                 }
 1407                 short_uio.uio_iov = &short_iovec[0];
 1408                 short_iovec[0].iov_base = (void *)addr;
 1409                 short_uio.uio_iovcnt = 1;
 1410                 short_uio.uio_resid = short_iovec[0].iov_len = len;
 1411                 short_uio.uio_offset = uio_clone->uio_offset;
 1412                 td->td_ma = ma;
 1413                 td->td_ma_cnt = cnt;
 1414 
 1415                 error = vn_io_fault_doio(args, &short_uio, td);
 1416                 vm_page_unhold_pages(ma, cnt);
 1417                 adv = len - short_uio.uio_resid;
 1418 
 1419                 uio_clone->uio_iov->iov_base =
 1420                     (char *)uio_clone->uio_iov->iov_base + adv;
 1421                 uio_clone->uio_iov->iov_len -= adv;
 1422                 uio_clone->uio_resid -= adv;
 1423                 uio_clone->uio_offset += adv;
 1424 
 1425                 uio->uio_resid -= adv;
 1426                 uio->uio_offset += adv;
 1427 
 1428                 if (error != 0 || adv == 0)
 1429                         break;
 1430         }
 1431         td->td_ma = prev_td_ma;
 1432         td->td_ma_cnt = prev_td_ma_cnt;
 1433         curthread_pflags_restore(saveheld);
 1434 out:
 1435         free(uio_clone, M_IOV);
 1436         return (error);
 1437 }
 1438 
 1439 static int
 1440 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
 1441     int flags, struct thread *td)
 1442 {
 1443         fo_rdwr_t *doio;
 1444         struct vnode *vp;
 1445         void *rl_cookie;
 1446         struct vn_io_fault_args args;
 1447         int error;
 1448 
 1449         doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
 1450         vp = fp->f_vnode;
 1451 
 1452         /*
 1453          * The ability to read(2) on a directory has historically been
 1454          * allowed for all users, but it has been the source of at least
 1455          * one security issue in the past.  As such, it is now hidden away
 1456          * behind a sysctl for those who actually need to use it, and is
 1457          * restricted to root when turned on, making it relatively safe to
 1458          * leave enabled for longer sessions.
 1459          */
 1460         if (vp->v_type == VDIR) {
 1461                 KASSERT(uio->uio_rw == UIO_READ,
 1462                     ("illegal write attempted on a directory"));
 1463                 if (!vfs_allow_read_dir)
 1464                         return (EISDIR);
 1465                 if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
 1466                         return (EISDIR);
 1467         }
 1468 
 1469         foffset_lock_uio(fp, uio, flags);
 1470         if (do_vn_io_fault(vp, uio)) {
 1471                 args.kind = VN_IO_FAULT_FOP;
 1472                 args.args.fop_args.fp = fp;
 1473                 args.args.fop_args.doio = doio;
 1474                 args.cred = active_cred;
 1475                 args.flags = flags | FOF_OFFSET;
 1476                 if (uio->uio_rw == UIO_READ) {
 1477                         rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
 1478                             uio->uio_offset + uio->uio_resid);
 1479                 } else if ((fp->f_flag & O_APPEND) != 0 ||
 1480                     (flags & FOF_OFFSET) == 0) {
 1481                         /* For appenders, punt and lock the whole range. */
 1482                         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 1483                 } else {
 1484                         rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
 1485                             uio->uio_offset + uio->uio_resid);
 1486                 }
 1487                 error = vn_io_fault1(vp, uio, &args, td);
 1488                 vn_rangelock_unlock(vp, rl_cookie);
 1489         } else {
 1490                 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 1491         }
 1492         foffset_unlock_uio(fp, uio, flags);
 1493         return (error);
 1494 }
 1495 
 1496 /*
 1497  * Helper function to perform the requested uiomove operation using
 1498  * the held pages for the uio->uio_iov[0].iov_base buffer instead of
 1499  * copyin/copyout.  Access to the pages with uiomove_fromphys()
 1500  * instead of iov_base prevents page faults that could occur due to
 1501  * pmap_collect() invalidating the mapping created by
 1502  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 1503  * object cleanup revoking the write access from page mappings.
 1504  *
 1505  * Filesystems that set MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 1506  * instead of plain uiomove().
 1507  */
 1508 int
 1509 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
 1510 {
 1511         struct uio transp_uio;
 1512         struct iovec transp_iov[1];
 1513         struct thread *td;
 1514         size_t adv;
 1515         int error, pgadv;
 1516 
 1517         td = curthread;
 1518         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 1519             uio->uio_segflg != UIO_USERSPACE)
 1520                 return (uiomove(data, xfersize, uio));
 1521 
 1522         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 1523         transp_iov[0].iov_base = data;
 1524         transp_uio.uio_iov = &transp_iov[0];
 1525         transp_uio.uio_iovcnt = 1;
 1526         if (xfersize > uio->uio_resid)
 1527                 xfersize = uio->uio_resid;
 1528         transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
 1529         transp_uio.uio_offset = 0;
 1530         transp_uio.uio_segflg = UIO_SYSSPACE;
 1531         /*
 1532          * Since transp_iov points to data, and td_ma page array
 1533          * corresponds to original uio->uio_iov, we need to invert the
 1534          * direction of the i/o operation as passed to
 1535          * uiomove_fromphys().
 1536          */
 1537         switch (uio->uio_rw) {
 1538         case UIO_WRITE:
 1539                 transp_uio.uio_rw = UIO_READ;
 1540                 break;
 1541         case UIO_READ:
 1542                 transp_uio.uio_rw = UIO_WRITE;
 1543                 break;
 1544         }
 1545         transp_uio.uio_td = uio->uio_td;
 1546         error = uiomove_fromphys(td->td_ma,
 1547             ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
 1548             xfersize, &transp_uio);
 1549         adv = xfersize - transp_uio.uio_resid;
 1550         pgadv =
 1551             (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
 1552             (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
 1553         td->td_ma += pgadv;
 1554         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 1555             pgadv));
 1556         td->td_ma_cnt -= pgadv;
 1557         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
 1558         uio->uio_iov->iov_len -= adv;
 1559         uio->uio_resid -= adv;
 1560         uio->uio_offset += adv;
 1561         return (error);
 1562 }
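
/*
 * Illustrative sketch, not part of this file: a filesystem whose mount
 * sets MNTK_NO_IOPF runs its VOP_READ/VOP_WRITE under vn_io_fault() with
 * the user pages already held, so it must copy through
 * vn_io_fault_uiomove() wherever it would otherwise call uiomove().
 * The function name and the "blkbuf"/"blksize" parameters below are
 * hypothetical placeholders for the filesystem's own buffer.
 */
static int __unused
example_fs_copy_block(char *blkbuf, int blksize, struct uio *uio)
{

	/*
	 * Consumes the pages held in curthread->td_ma when TDP_UIOHELD
	 * is set; otherwise falls back to plain uiomove().
	 */
	return (vn_io_fault_uiomove(blkbuf, blksize, uio));
}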
 1563 
 1564 int
 1565 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 1566     struct uio *uio)
 1567 {
 1568         struct thread *td;
 1569         vm_offset_t iov_base;
 1570         int cnt, pgadv;
 1571 
 1572         td = curthread;
 1573         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 1574             uio->uio_segflg != UIO_USERSPACE)
 1575                 return (uiomove_fromphys(ma, offset, xfersize, uio));
 1576 
 1577         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 1578         cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
 1579         iov_base = (vm_offset_t)uio->uio_iov->iov_base;
 1580         switch (uio->uio_rw) {
 1581         case UIO_WRITE:
 1582                 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
 1583                     offset, cnt);
 1584                 break;
 1585         case UIO_READ:
 1586                 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
 1587                     cnt);
 1588                 break;
 1589         }
 1590         pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
 1591         td->td_ma += pgadv;
 1592         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 1593             pgadv));
 1594         td->td_ma_cnt -= pgadv;
 1595         uio->uio_iov->iov_base = (char *)(iov_base + cnt);
 1596         uio->uio_iov->iov_len -= cnt;
 1597         uio->uio_resid -= cnt;
 1598         uio->uio_offset += cnt;
 1599         return (0);
 1600 }
 1601 
 1602 /*
 1603  * File table truncate routine.
 1604  */
 1605 static int
 1606 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
 1607     struct thread *td)
 1608 {
 1609         struct mount *mp;
 1610         struct vnode *vp;
 1611         void *rl_cookie;
 1612         int error;
 1613 
 1614         vp = fp->f_vnode;
 1615 
 1616 retry:
 1617         /*
 1618          * Lock the whole range for truncation.  Otherwise split i/o
 1619          * might happen partly before and partly after the truncation.
 1620          */
 1621         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 1622         error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 1623         if (error)
 1624                 goto out1;
 1625         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1626         AUDIT_ARG_VNODE1(vp);
 1627         if (vp->v_type == VDIR) {
 1628                 error = EISDIR;
 1629                 goto out;
 1630         }
 1631 #ifdef MAC
 1632         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 1633         if (error)
 1634                 goto out;
 1635 #endif
 1636         error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
 1637             fp->f_cred);
 1638 out:
 1639         VOP_UNLOCK(vp);
 1640         vn_finished_write(mp);
 1641 out1:
 1642         vn_rangelock_unlock(vp, rl_cookie);
 1643         if (error == ERELOOKUP)
 1644                 goto retry;
 1645         return (error);
 1646 }
 1647 
 1648 /*
 1649  * Truncate a file that is already locked.
 1650  */
 1651 int
 1652 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
 1653     struct ucred *cred)
 1654 {
 1655         struct vattr vattr;
 1656         int error;
 1657 
 1658         error = VOP_ADD_WRITECOUNT(vp, 1);
 1659         if (error == 0) {
 1660                 VATTR_NULL(&vattr);
 1661                 vattr.va_size = length;
 1662                 if (sync)
 1663                         vattr.va_vaflags |= VA_SYNC;
 1664                 error = VOP_SETATTR(vp, &vattr, cred);
 1665                 VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 1666         }
 1667         return (error);
 1668 }
 1669 
 1670 /*
 1671  * File table vnode stat routine.
 1672  */
 1673 int
 1674 vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred)
 1675 {
 1676         struct vnode *vp = fp->f_vnode;
 1677         int error;
 1678 
 1679         vn_lock(vp, LK_SHARED | LK_RETRY);
 1680         error = VOP_STAT(vp, sb, active_cred, fp->f_cred);
 1681         VOP_UNLOCK(vp);
 1682 
 1683         return (error);
 1684 }
 1685 
 1686 /*
 1687  * File table vnode ioctl routine.
 1688  */
 1689 static int
 1690 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
 1691     struct thread *td)
 1692 {
 1693         struct vnode *vp;
 1694         struct fiobmap2_arg *bmarg;
 1695         off_t size;
 1696         int error;
 1697 
 1698         vp = fp->f_vnode;
 1699         switch (vp->v_type) {
 1700         case VDIR:
 1701         case VREG:
 1702                 switch (com) {
 1703                 case FIONREAD:
 1704                         error = vn_getsize(vp, &size, active_cred);
 1705                         if (error == 0)
 1706                                 *(int *)data = size - fp->f_offset;
 1707                         return (error);
 1708                 case FIOBMAP2:
 1709                         bmarg = (struct fiobmap2_arg *)data;
 1710                         vn_lock(vp, LK_SHARED | LK_RETRY);
 1711 #ifdef MAC
 1712                         error = mac_vnode_check_read(active_cred, fp->f_cred,
 1713                             vp);
 1714                         if (error == 0)
 1715 #endif
 1716                                 error = VOP_BMAP(vp, bmarg->bn, NULL,
 1717                                     &bmarg->bn, &bmarg->runp, &bmarg->runb);
 1718                         VOP_UNLOCK(vp);
 1719                         return (error);
 1720                 case FIONBIO:
 1721                 case FIOASYNC:
 1722                         return (0);
 1723                 default:
 1724                         return (VOP_IOCTL(vp, com, data, fp->f_flag,
 1725                             active_cred, td));
 1726                 }
 1727                 break;
 1728         case VCHR:
 1729                 return (VOP_IOCTL(vp, com, data, fp->f_flag,
 1730                     active_cred, td));
 1731         default:
 1732                 return (ENOTTY);
 1733         }
 1734 }
 1735 
 1736 /*
 1737  * File table vnode poll routine.
 1738  */
 1739 static int
 1740 vn_poll(struct file *fp, int events, struct ucred *active_cred,
 1741     struct thread *td)
 1742 {
 1743         struct vnode *vp;
 1744         int error;
 1745 
 1746         vp = fp->f_vnode;
 1747 #if defined(MAC) || defined(AUDIT)
 1748         if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
 1749                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1750                 AUDIT_ARG_VNODE1(vp);
 1751                 error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 1752                 VOP_UNLOCK(vp);
 1753                 if (error != 0)
 1754                         return (error);
 1755         }
 1756 #endif
 1757         error = VOP_POLL(vp, events, fp->f_cred, td);
 1758         return (error);
 1759 }
 1760 
 1761 /*
 1762  * Acquire the requested lock and then check for validity.  LK_RETRY
 1763  * permits vn_lock to return doomed vnodes.
 1764  */
 1765 static int __noinline
 1766 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
 1767     int error)
 1768 {
 1769 
 1770         KASSERT((flags & LK_RETRY) == 0 || error == 0,
 1771             ("vn_lock: error %d incompatible with flags %#x", error, flags));
 1772 
 1773         if (error == 0)
 1774                 VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
 1775 
 1776         if ((flags & LK_RETRY) == 0) {
 1777                 if (error == 0) {
 1778                         VOP_UNLOCK(vp);
 1779                         error = ENOENT;
 1780                 }
 1781                 return (error);
 1782         }
 1783 
 1784         /*
 1785          * LK_RETRY case.
 1786          *
 1787          * Nothing to do if we got the lock.
 1788          */
 1789         if (error == 0)
 1790                 return (0);
 1791 
 1792         /*
 1793          * Interlock was dropped by the call in _vn_lock.
 1794          */
 1795         flags &= ~LK_INTERLOCK;
 1796         do {
 1797                 error = VOP_LOCK1(vp, flags, file, line);
 1798         } while (error != 0);
 1799         return (0);
 1800 }
 1801 
 1802 int
 1803 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
 1804 {
 1805         int error;
 1806 
 1807         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 1808             ("vn_lock: no locktype (%d passed)", flags));
 1809         VNPASS(vp->v_holdcnt > 0, vp);
 1810         error = VOP_LOCK1(vp, flags, file, line);
 1811         if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
 1812                 return (_vn_lock_fallback(vp, flags, file, line, error));
 1813         return (0);
 1814 }
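
/*
 * Illustrative sketch, not part of this file: most callers use the
 * vn_lock() macro, which supplies the file/line arguments.  The vnode
 * must be held (vref()/vhold()) across the call.  Without LK_RETRY a
 * doomed vnode causes the lock attempt to fail with ENOENT; with
 * LK_RETRY the lock always succeeds, so callers that care about forced
 * unmounts must check VN_IS_DOOMED() themselves.
 */
static int __unused
example_lock_held_vnode(struct vnode *vp)
{
	int error;

	error = vn_lock(vp, LK_EXCLUSIVE);
	if (error != 0)
		return (error);		/* e.g. ENOENT if vp was doomed */
	/* ... operate on the exclusively locked vnode ... */
	VOP_UNLOCK(vp);

	vn_lock(vp, LK_SHARED | LK_RETRY);
	if (VN_IS_DOOMED(vp)) {
		VOP_UNLOCK(vp);
		return (ENOENT);
	}
	/* ... operate on the shared-locked vnode ... */
	VOP_UNLOCK(vp);
	return (0);
}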
 1815 
 1816 /*
 1817  * File table vnode close routine.
 1818  */
 1819 static int
 1820 vn_closefile(struct file *fp, struct thread *td)
 1821 {
 1822         struct vnode *vp;
 1823         struct flock lf;
 1824         int error;
 1825         bool ref;
 1826 
 1827         vp = fp->f_vnode;
 1828         fp->f_ops = &badfileops;
 1829         ref = (fp->f_flag & FHASLOCK) != 0;
 1830 
 1831         error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
 1832 
 1833         if (__predict_false(ref)) {
 1834                 lf.l_whence = SEEK_SET;
 1835                 lf.l_start = 0;
 1836                 lf.l_len = 0;
 1837                 lf.l_type = F_UNLCK;
 1838                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 1839                 vrele(vp);
 1840         }
 1841         return (error);
 1842 }
 1843 
 1844 /*
 1845  * Prepare to start a filesystem write operation.  If the operation is
 1846  * permitted, then we bump the count of operations in progress and
 1847  * proceed. If a suspend request is in progress, we wait until the
 1848  * suspension is over, and then proceed.
 1849  */
 1850 static int
 1851 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
 1852 {
 1853         struct mount_pcpu *mpcpu;
 1854         int error, mflags;
 1855 
 1856         if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
 1857             vfs_op_thread_enter(mp, mpcpu)) {
 1858                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 1859                 vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
 1860                 vfs_op_thread_exit(mp, mpcpu);
 1861                 return (0);
 1862         }
 1863 
 1864         if (mplocked)
 1865                 mtx_assert(MNT_MTX(mp), MA_OWNED);
 1866         else
 1867                 MNT_ILOCK(mp);
 1868 
 1869         error = 0;
 1870 
 1871         /*
 1872          * Check on status of suspension.
 1873          */
 1874         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 1875             mp->mnt_susp_owner != curthread) {
 1876                 mflags = 0;
 1877                 if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) {
 1878                         if (flags & V_PCATCH)
 1879                                 mflags |= PCATCH;
 1880                 }
 1881                 mflags |= (PUSER - 1);
 1882                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 1883                         if (flags & V_NOWAIT) {
 1884                                 error = EWOULDBLOCK;
 1885                                 goto unlock;
 1886                         }
 1887                         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
 1888                             "suspfs", 0);
 1889                         if (error)
 1890                                 goto unlock;
 1891                 }
 1892         }
 1893         if (flags & V_XSLEEP)
 1894                 goto unlock;
 1895         mp->mnt_writeopcount++;
 1896 unlock:
 1897         if (error != 0 || (flags & V_XSLEEP) != 0)
 1898                 MNT_REL(mp);
 1899         MNT_IUNLOCK(mp);
 1900         return (error);
 1901 }
 1902 
 1903 int
 1904 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
 1905 {
 1906         struct mount *mp;
 1907         int error;
 1908 
 1909         KASSERT((flags & ~V_VALID_FLAGS) == 0,
 1910             ("%s: invalid flags passed %d\n", __func__, flags));
 1911 
 1912         error = 0;
 1913         /*
 1914          * If a vnode is provided, get and return the mount point to
 1915          * which it will write.
 1916          */
 1917         if (vp != NULL) {
 1918                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 1919                         *mpp = NULL;
 1920                         if (error != EOPNOTSUPP)
 1921                                 return (error);
 1922                         return (0);
 1923                 }
 1924         }
 1925         if ((mp = *mpp) == NULL)
 1926                 return (0);
 1927 
 1928         /*
 1929          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 1930          * a vfs_ref().
 1931          * If no vnode was provided, we need to acquire a refcount on the
 1932          * provided mountpoint ourselves, in order to emulate that
 1933          * vfs_ref().
 1934          */
 1935         if (vp == NULL)
 1936                 vfs_ref(mp);
 1937 
 1938         return (vn_start_write_refed(mp, flags, false));
 1939 }
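
/*
 * Illustrative sketch, not part of this file: a write-type operation is
 * bracketed by vn_start_write() and vn_finished_write() so that it
 * cannot run while the filesystem is suspended (vn_truncate() above is
 * a real example).  The VOP_SETATTR() body is just a placeholder.
 */
static int __unused
example_write_op(struct vnode *vp, struct vattr *vap, struct ucred *cred)
{
	struct mount *mp;
	int error;

	error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_SETATTR(vp, vap, cred);
	VOP_UNLOCK(vp);
	vn_finished_write(mp);
	return (error);
}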
 1940 
 1941 /*
 1942  * Secondary suspension. Used by operations such as vop_inactive
 1943  * routines that are needed by the higher level functions. These
 1944  * are allowed to proceed until all the higher level functions have
 1945  * completed (indicated by mnt_writeopcount dropping to zero). At that
 1946  * time, these operations are halted until the suspension is over.
 1947  */
 1948 int
 1949 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
 1950 {
 1951         struct mount *mp;
 1952         int error, mflags;
 1953 
 1954         KASSERT((flags & ~V_VALID_FLAGS) == 0,
 1955             ("%s: invalid flags passed %d\n", __func__, flags));
 1956 
 1957  retry:
 1958         if (vp != NULL) {
 1959                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 1960                         *mpp = NULL;
 1961                         if (error != EOPNOTSUPP)
 1962                                 return (error);
 1963                         return (0);
 1964                 }
 1965         }
 1966         /*
 1967          * If we are not suspended or have not yet reached suspended
 1968          * mode, then let the operation proceed.
 1969          */
 1970         if ((mp = *mpp) == NULL)
 1971                 return (0);
 1972 
 1973         /*
 1974          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 1975          * a vfs_ref().
 1976          * If no vnode was provided, we need to acquire a refcount on the
 1977          * provided mountpoint ourselves, in order to emulate that
 1978          * vfs_ref().
 1979          */
 1980         MNT_ILOCK(mp);
 1981         if (vp == NULL)
 1982                 MNT_REF(mp);
 1983         if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 1984                 mp->mnt_secondary_writes++;
 1985                 mp->mnt_secondary_accwrites++;
 1986                 MNT_IUNLOCK(mp);
 1987                 return (0);
 1988         }
 1989         if (flags & V_NOWAIT) {
 1990                 MNT_REL(mp);
 1991                 MNT_IUNLOCK(mp);
 1992                 return (EWOULDBLOCK);
 1993         }
 1994         /*
 1995          * Wait for the suspension to finish.
 1996          */
 1997         mflags = 0;
 1998         if ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0) {
 1999                 if (flags & V_PCATCH)
 2000                         mflags |= PCATCH;
 2001         }
 2002         mflags |= (PUSER - 1) | PDROP;
 2003         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags, "suspfs", 0);
 2004         vfs_rel(mp);
 2005         if (error == 0)
 2006                 goto retry;
 2007         return (error);
 2008 }
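
/*
 * Illustrative sketch, not part of this file: an inactive-style routine
 * that must push an update to disk uses the secondary write gate,
 * typically with V_NOWAIT so that it can defer the work rather than
 * sleep while a suspension is draining primary writers.  The vnode is
 * assumed to be locked by the caller, as in VOP_INACTIVE().
 */
static int __unused
example_secondary_update(struct vnode *vp)
{
	struct mount *mp;
	int error;

	if (vn_start_secondary_write(vp, &mp, V_NOWAIT) != 0) {
		/* Suspension in progress; leave the update for later. */
		return (EWOULDBLOCK);
	}
	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
	vn_finished_secondary_write(mp);
	return (error);
}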
 2009 
 2010 /*
 2011  * Filesystem write operation has completed. If we are suspending and this
 2012  * operation is the last one, notify the suspender that the suspension is
 2013  * now in effect.
 2014  */
 2015 void
 2016 vn_finished_write(struct mount *mp)
 2017 {
 2018         struct mount_pcpu *mpcpu;
 2019         int c;
 2020 
 2021         if (mp == NULL)
 2022                 return;
 2023 
 2024         if (vfs_op_thread_enter(mp, mpcpu)) {
 2025                 vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
 2026                 vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 2027                 vfs_op_thread_exit(mp, mpcpu);
 2028                 return;
 2029         }
 2030 
 2031         MNT_ILOCK(mp);
 2032         vfs_assert_mount_counters(mp);
 2033         MNT_REL(mp);
 2034         c = --mp->mnt_writeopcount;
 2035         if (mp->mnt_vfs_ops == 0) {
 2036                 MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 2037                 MNT_IUNLOCK(mp);
 2038                 return;
 2039         }
 2040         if (c < 0)
 2041                 vfs_dump_mount_counters(mp);
 2042         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
 2043                 wakeup(&mp->mnt_writeopcount);
 2044         MNT_IUNLOCK(mp);
 2045 }
 2046 
 2047 /*
 2048  * Filesystem secondary write operation has completed. If we are
 2049  * suspending and this operation is the last one, notify the suspender
 2050  * that the suspension is now in effect.
 2051  */
 2052 void
 2053 vn_finished_secondary_write(struct mount *mp)
 2054 {
 2055         if (mp == NULL)
 2056                 return;
 2057         MNT_ILOCK(mp);
 2058         MNT_REL(mp);
 2059         mp->mnt_secondary_writes--;
 2060         if (mp->mnt_secondary_writes < 0)
 2061                 panic("vn_finished_secondary_write: neg cnt");
 2062         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 2063             mp->mnt_secondary_writes <= 0)
 2064                 wakeup(&mp->mnt_secondary_writes);
 2065         MNT_IUNLOCK(mp);
 2066 }
 2067 
 2068 /*
 2069  * Request a filesystem to suspend write operations.
 2070  */
 2071 int
 2072 vfs_write_suspend(struct mount *mp, int flags)
 2073 {
 2074         int error;
 2075 
 2076         vfs_op_enter(mp);
 2077 
 2078         MNT_ILOCK(mp);
 2079         vfs_assert_mount_counters(mp);
 2080         if (mp->mnt_susp_owner == curthread) {
 2081                 vfs_op_exit_locked(mp);
 2082                 MNT_IUNLOCK(mp);
 2083                 return (EALREADY);
 2084         }
 2085         while (mp->mnt_kern_flag & MNTK_SUSPEND)
 2086                 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 2087 
 2088         /*
 2089          * Unmount holds a write reference on the mount point.  If we
 2090          * own a busy reference and drain for writers, we deadlock with
 2091          * the reference draining in the unmount path.  Callers of
 2092          * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
 2093          * a vfs_busy() reference is owned and the caller is not in the
 2094          * unmount context.
 2095          */
 2096         if ((flags & VS_SKIP_UNMOUNT) != 0 &&
 2097             (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 2098                 vfs_op_exit_locked(mp);
 2099                 MNT_IUNLOCK(mp);
 2100                 return (EBUSY);
 2101         }
 2102 
 2103         mp->mnt_kern_flag |= MNTK_SUSPEND;
 2104         mp->mnt_susp_owner = curthread;
 2105         if (mp->mnt_writeopcount > 0)
 2106                 (void) msleep(&mp->mnt_writeopcount, 
 2107                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 2108         else
 2109                 MNT_IUNLOCK(mp);
 2110         if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
 2111                 vfs_write_resume(mp, 0);
 2112                 /* vfs_write_resume does vfs_op_exit() for us */
 2113         }
 2114         return (error);
 2115 }
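
/*
 * Illustrative sketch, not part of this file: a snapshot-like consumer
 * suspends writes, works on the quiescent filesystem, and then resumes.
 * Per the comment above, VS_SKIP_UNMOUNT is passed when the caller owns
 * a vfs_busy() reference and is not the unmount path.
 */
static int __unused
example_quiesce(struct mount *mp)
{
	int error;

	error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
	if (error != 0)
		return (error);
	/* ... writes are suspended and VFS_SYNC() has completed ... */
	vfs_write_resume(mp, 0);
	return (0);
}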
 2116 
 2117 /*
 2118  * Request a filesystem to resume write operations.
 2119  */
 2120 void
 2121 vfs_write_resume(struct mount *mp, int flags)
 2122 {
 2123 
 2124         MNT_ILOCK(mp);
 2125         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 2126                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 2127                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 2128                                        MNTK_SUSPENDED);
 2129                 mp->mnt_susp_owner = NULL;
 2130                 wakeup(&mp->mnt_writeopcount);
 2131                 wakeup(&mp->mnt_flag);
 2132                 curthread->td_pflags &= ~TDP_IGNSUSP;
 2133                 if ((flags & VR_START_WRITE) != 0) {
 2134                         MNT_REF(mp);
 2135                         mp->mnt_writeopcount++;
 2136                 }
 2137                 MNT_IUNLOCK(mp);
 2138                 if ((flags & VR_NO_SUSPCLR) == 0)
 2139                         VFS_SUSP_CLEAN(mp);
 2140                 vfs_op_exit(mp);
 2141         } else if ((flags & VR_START_WRITE) != 0) {
 2142                 MNT_REF(mp);
 2143                 vn_start_write_refed(mp, 0, true);
 2144         } else {
 2145                 MNT_IUNLOCK(mp);
 2146         }
 2147 }
 2148 
 2149 /*
 2150  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
 2151  * methods.
 2152  */
 2153 int
 2154 vfs_write_suspend_umnt(struct mount *mp)
 2155 {
 2156         int error;
 2157 
 2158         KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
 2159             ("vfs_write_suspend_umnt: recursed"));
 2160 
 2161         /* dounmount() already called vn_start_write(). */
 2162         for (;;) {
 2163                 vn_finished_write(mp);
 2164                 error = vfs_write_suspend(mp, 0);
 2165                 if (error != 0) {
 2166                         vn_start_write(NULL, &mp, V_WAIT);
 2167                         return (error);
 2168                 }
 2169                 MNT_ILOCK(mp);
 2170                 if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 2171                         break;
 2172                 MNT_IUNLOCK(mp);
 2173                 vn_start_write(NULL, &mp, V_WAIT);
 2174         }
 2175         mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
 2176         wakeup(&mp->mnt_flag);
 2177         MNT_IUNLOCK(mp);
 2178         curthread->td_pflags |= TDP_IGNSUSP;
 2179         return (0);
 2180 }
 2181 
 2182 /*
 2183  * Implement kqueues for files by translating them to the vnode operation.
 2184  */
 2185 static int
 2186 vn_kqfilter(struct file *fp, struct knote *kn)
 2187 {
 2188 
 2189         return (VOP_KQFILTER(fp->f_vnode, kn));
 2190 }
 2191 
 2192 int
 2193 vn_kqfilter_opath(struct file *fp, struct knote *kn)
 2194 {
 2195         if ((fp->f_flag & FKQALLOWED) == 0)
 2196                 return (EBADF);
 2197         return (vn_kqfilter(fp, kn));
 2198 }
 2199 
 2200 /*
 2201  * Simplified in-kernel wrapper calls for extended attribute access.
 2202  * Both calls pass in a NULL credential, authorizing as "kernel" access.
 2203  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 2204  */
 2205 int
 2206 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 2207     const char *attrname, int *buflen, char *buf, struct thread *td)
 2208 {
 2209         struct uio      auio;
 2210         struct iovec    iov;
 2211         int     error;
 2212 
 2213         iov.iov_len = *buflen;
 2214         iov.iov_base = buf;
 2215 
 2216         auio.uio_iov = &iov;
 2217         auio.uio_iovcnt = 1;
 2218         auio.uio_rw = UIO_READ;
 2219         auio.uio_segflg = UIO_SYSSPACE;
 2220         auio.uio_td = td;
 2221         auio.uio_offset = 0;
 2222         auio.uio_resid = *buflen;
 2223 
 2224         if ((ioflg & IO_NODELOCKED) == 0)
 2225                 vn_lock(vp, LK_SHARED | LK_RETRY);
 2226 
 2227         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 2228 
 2229         /* authorize attribute retrieval as kernel */
 2230         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 2231             td);
 2232 
 2233         if ((ioflg & IO_NODELOCKED) == 0)
 2234                 VOP_UNLOCK(vp);
 2235 
 2236         if (error == 0) {
 2237                 *buflen = *buflen - auio.uio_resid;
 2238         }
 2239 
 2240         return (error);
 2241 }
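
/*
 * Illustrative sketch, not part of this file: reading a small extended
 * attribute from an unlocked vnode with kernel credentials.  The
 * namespace and attribute name below are hypothetical.
 */
static int __unused
example_read_extattr(struct vnode *vp, struct thread *td)
{
	char buf[64];
	int buflen, error;

	buflen = sizeof(buf);
	/* ioflg 0: vn_extattr_get() takes and drops the vnode lock. */
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example.attr", &buflen, buf, td);
	if (error != 0)
		return (error);
	/* buflen was updated to the number of bytes returned. */
	return (0);
}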
 2242 
 2243 /*
 2244  * XXX failure mode if partially written?
 2245  */
 2246 int
 2247 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 2248     const char *attrname, int buflen, char *buf, struct thread *td)
 2249 {
 2250         struct uio      auio;
 2251         struct iovec    iov;
 2252         struct mount    *mp;
 2253         int     error;
 2254 
 2255         iov.iov_len = buflen;
 2256         iov.iov_base = buf;
 2257 
 2258         auio.uio_iov = &iov;
 2259         auio.uio_iovcnt = 1;
 2260         auio.uio_rw = UIO_WRITE;
 2261         auio.uio_segflg = UIO_SYSSPACE;
 2262         auio.uio_td = td;
 2263         auio.uio_offset = 0;
 2264         auio.uio_resid = buflen;
 2265 
 2266         if ((ioflg & IO_NODELOCKED) == 0) {
 2267                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 2268                         return (error);
 2269                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2270         }
 2271 
 2272         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 2273 
 2274         /* authorize attribute setting as kernel */
 2275         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 2276 
 2277         if ((ioflg & IO_NODELOCKED) == 0) {
 2278                 vn_finished_write(mp);
 2279                 VOP_UNLOCK(vp);
 2280         }
 2281 
 2282         return (error);
 2283 }
 2284 
 2285 int
 2286 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 2287     const char *attrname, struct thread *td)
 2288 {
 2289         struct mount    *mp;
 2290         int     error;
 2291 
 2292         if ((ioflg & IO_NODELOCKED) == 0) {
 2293                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 2294                         return (error);
 2295                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2296         }
 2297 
 2298         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 2299 
 2300         /* authorize attribute removal as kernel */
 2301         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 2302         if (error == EOPNOTSUPP)
 2303                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 2304                     NULL, td);
 2305 
 2306         if ((ioflg & IO_NODELOCKED) == 0) {
 2307                 vn_finished_write(mp);
 2308                 VOP_UNLOCK(vp);
 2309         }
 2310 
 2311         return (error);
 2312 }
 2313 
 2314 static int
 2315 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
 2316     struct vnode **rvp)
 2317 {
 2318 
 2319         return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
 2320 }
 2321 
 2322 int
 2323 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 2324 {
 2325 
 2326         return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
 2327             lkflags, rvp));
 2328 }
 2329 
 2330 int
 2331 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
 2332     int lkflags, struct vnode **rvp)
 2333 {
 2334         struct mount *mp;
 2335         int ltype, error;
 2336 
 2337         ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
 2338         mp = vp->v_mount;
 2339         ltype = VOP_ISLOCKED(vp);
 2340         KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 2341             ("vn_vget_ino: vp not locked"));
 2342         error = vfs_busy(mp, MBF_NOWAIT);
 2343         if (error != 0) {
 2344                 vfs_ref(mp);
 2345                 VOP_UNLOCK(vp);
 2346                 error = vfs_busy(mp, 0);
 2347                 vn_lock(vp, ltype | LK_RETRY);
 2348                 vfs_rel(mp);
 2349                 if (error != 0)
 2350                         return (ENOENT);
 2351                 if (VN_IS_DOOMED(vp)) {
 2352                         vfs_unbusy(mp);
 2353                         return (ENOENT);
 2354                 }
 2355         }
 2356         VOP_UNLOCK(vp);
 2357         error = alloc(mp, alloc_arg, lkflags, rvp);
 2358         vfs_unbusy(mp);
 2359         if (error != 0 || *rvp != vp)
 2360                 vn_lock(vp, ltype | LK_RETRY);
 2361         if (VN_IS_DOOMED(vp)) {
 2362                 if (error == 0) {
 2363                         if (*rvp == vp)
 2364                                 vunref(vp);
 2365                         else
 2366                                 vput(*rvp);
 2367                 }
 2368                 error = ENOENT;
 2369         }
 2370         return (error);
 2371 }
 2372 
 2373 static void
 2374 vn_send_sigxfsz(struct proc *p)
 2375 {
 2376         PROC_LOCK(p);
 2377         kern_psignal(p, SIGXFSZ);
 2378         PROC_UNLOCK(p);
 2379 }
 2380 
 2381 int
 2382 vn_rlimit_trunc(u_quad_t size, struct thread *td)
 2383 {
 2384         if (size <= lim_cur(td, RLIMIT_FSIZE))
 2385                 return (0);
 2386         vn_send_sigxfsz(td->td_proc);
 2387         return (EFBIG);
 2388 }
 2389 
 2390 static int
 2391 vn_rlimit_fsizex1(const struct vnode *vp, struct uio *uio, off_t maxfsz,
 2392     bool adj, struct thread *td)
 2393 {
 2394         off_t lim;
 2395         bool ktr_write;
 2396 
 2397         if (vp->v_type != VREG)
 2398                 return (0);
 2399 
 2400         /*
 2401          * Handle file system maximum file size.
 2402          */
 2403         if (maxfsz != 0 && uio->uio_offset + uio->uio_resid > maxfsz) {
 2404                 if (!adj || uio->uio_offset >= maxfsz)
 2405                         return (EFBIG);
 2406                 uio->uio_resid = maxfsz - uio->uio_offset;
 2407         }
 2408 
 2409         /*
 2410          * This is a kernel write (e.g. from the vnode pager) or an
 2411          * accounting write; ignore the limit.
 2412          */
 2413         if (td == NULL || (td->td_pflags2 & TDP2_ACCT) != 0)
 2414                 return (0);
 2415 
 2416         /*
 2417          * Calculate file size limit.
 2418          */
 2419         ktr_write = (td->td_pflags & TDP_INKTRACE) != 0;
 2420         lim = __predict_false(ktr_write) ? td->td_ktr_io_lim :
 2421             lim_cur(td, RLIMIT_FSIZE);
 2422 
 2423         /*
 2424          * Is the limit reached?
 2425          */
 2426         if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim))
 2427                 return (0);
 2428 
 2429         /*
 2430          * Prepared filesystems can handle writes truncated to the
 2431          * file size limit.
 2432          */
 2433         if (adj && (uoff_t)uio->uio_offset < lim) {
 2434                 uio->uio_resid = lim - (uoff_t)uio->uio_offset;
 2435                 return (0);
 2436         }
 2437 
 2438         if (!ktr_write || ktr_filesize_limit_signal)
 2439                 vn_send_sigxfsz(td->td_proc);
 2440         return (EFBIG);
 2441 }
 2442 
 2443 /*
 2444  * Helper for VOP_WRITE() implementations, the common code to
 2445  * handle maximum supported file size on the filesystem, and
 2446  * RLIMIT_FSIZE, except for special writes from accounting subsystem
 2447  * and ktrace.
 2448  *
 2449  * For maximum file size (maxfsz argument):
 2450  * - return EFBIG if uio_offset is beyond it
 2451  * - otherwise, clamp uio_resid if write would extend file beyond maxfsz.
 2452  *
 2453  * For RLIMIT_FSIZE:
 2454  * - return EFBIG and send SIGXFSZ if uio_offset is beyond the limit
 2455  * - otherwise, clamp uio_resid if write would extend file beyond limit.
 2456  *
 2457  * If clamping occurred, the adjustment for uio_resid is stored in
 2458  * *resid_adj, to be re-applied by vn_rlimit_fsizex_res() on return
 2459  * from the VOP.
 2460  */
 2461 int
 2462 vn_rlimit_fsizex(const struct vnode *vp, struct uio *uio, off_t maxfsz,
 2463     ssize_t *resid_adj, struct thread *td)
 2464 {
 2465         ssize_t resid_orig;
 2466         int error;
 2467         bool adj;
 2468 
 2469         resid_orig = uio->uio_resid;
 2470         adj = resid_adj != NULL;
 2471         error = vn_rlimit_fsizex1(vp, uio, maxfsz, adj, td);
 2472         if (adj)
 2473                 *resid_adj = resid_orig - uio->uio_resid;
 2474         return (error);
 2475 }
 2476 
 2477 void
 2478 vn_rlimit_fsizex_res(struct uio *uio, ssize_t resid_adj)
 2479 {
 2480         uio->uio_resid += resid_adj;
 2481 }
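
/*
 * Illustrative sketch, not part of this file: the shape of a VOP_WRITE()
 * implementation using the helpers above.  "fs_maxfilesize" and the
 * elided write loop are hypothetical placeholders; the clamped portion
 * of uio_resid is restored by vn_rlimit_fsizex_res() before returning.
 */
static int __unused
example_vop_write(struct vnode *vp, struct uio *uio)
{
	ssize_t resid_adj;
	off_t fs_maxfilesize;
	int error;

	fs_maxfilesize = OFF_MAX;	/* hypothetical per-fs limit */
	error = vn_rlimit_fsizex(vp, uio, fs_maxfilesize, &resid_adj,
	    uio->uio_td);
	if (error != 0)
		return (error);
	/* ... perform the write, consuming uio->uio_resid ... */
	vn_rlimit_fsizex_res(uio, resid_adj);
	return (error);
}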
 2482 
 2483 int
 2484 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
 2485     struct thread *td)
 2486 {
 2487         return (vn_rlimit_fsizex(vp, __DECONST(struct uio *, uio), 0, NULL,
 2488             td));
 2489 }
 2490 
 2491 int
 2492 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
 2493     struct thread *td)
 2494 {
 2495         struct vnode *vp;
 2496 
 2497         vp = fp->f_vnode;
 2498 #ifdef AUDIT
 2499         vn_lock(vp, LK_SHARED | LK_RETRY);
 2500         AUDIT_ARG_VNODE1(vp);
 2501         VOP_UNLOCK(vp);
 2502 #endif
 2503         return (setfmode(td, active_cred, vp, mode));
 2504 }
 2505 
 2506 int
 2507 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 2508     struct thread *td)
 2509 {
 2510         struct vnode *vp;
 2511 
 2512         vp = fp->f_vnode;
 2513 #ifdef AUDIT
 2514         vn_lock(vp, LK_SHARED | LK_RETRY);
 2515         AUDIT_ARG_VNODE1(vp);
 2516         VOP_UNLOCK(vp);
 2517 #endif
 2518         return (setfown(td, active_cred, vp, uid, gid));
 2519 }
 2520 
 2521 /*
 2522  * Remove pages in the range ["start", "end") from the vnode's VM object.  If
 2523  * "end" is 0, then the range extends to the end of the object.
 2524  */
 2525 void
 2526 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 2527 {
 2528         vm_object_t object;
 2529 
 2530         if ((object = vp->v_object) == NULL)
 2531                 return;
 2532         VM_OBJECT_WLOCK(object);
 2533         vm_object_page_remove(object, start, end, 0);
 2534         VM_OBJECT_WUNLOCK(object);
 2535 }
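
/*
 * Illustrative sketch, not part of this file: invalidating the cached
 * pages backing a byte range of a vnode, e.g. after writing the range
 * by some path that bypassed the page cache.  OFF_TO_IDX() converts
 * byte offsets to page indices; passing 0 as "end" would instead purge
 * to the end of the object.
 */
static void __unused
example_purge_range(struct vnode *vp, off_t off, off_t len)
{

	vn_pages_remove(vp, OFF_TO_IDX(trunc_page(off)),
	    OFF_TO_IDX(round_page(off + len)));
}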
 2536 
 2537 /*
 2538  * Like vn_pages_remove(), but skips invalid pages, which by definition are not
 2539  * mapped into any process' address space.  Filesystems may use this in
 2540  * preference to vn_pages_remove() to avoid blocking on pages busied in
 2541  * preparation for a VOP_GETPAGES.
 2542  */
 2543 void
 2544 vn_pages_remove_valid(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 2545 {
 2546         vm_object_t object;
 2547 
 2548         if ((object = vp->v_object) == NULL)
 2549                 return;
 2550         VM_OBJECT_WLOCK(object);
 2551         vm_object_page_remove(object, start, end, OBJPR_VALIDONLY);
 2552         VM_OBJECT_WUNLOCK(object);
 2553 }
 2554 
 2555 int
 2556 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
 2557     struct ucred *cred)
 2558 {
 2559         off_t size;
 2560         daddr_t bn, bnp;
 2561         uint64_t bsize;
 2562         off_t noff;
 2563         int error;
 2564 
 2565         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 2566             ("%s: Wrong command %lu", __func__, cmd));
 2567         ASSERT_VOP_LOCKED(vp, "vn_bmap_seekhole_locked");
 2568 
 2569         if (vp->v_type != VREG) {
 2570                 error = ENOTTY;
 2571                 goto out;
 2572         }
 2573         error = vn_getsize_locked(vp, &size, cred);
 2574         if (error != 0)
 2575                 goto out;
 2576         noff = *off;
 2577         if (noff < 0 || noff >= size) {
 2578                 error = ENXIO;
 2579                 goto out;
 2580         }
 2581         bsize = vp->v_mount->mnt_stat.f_iosize;
 2582         for (bn = noff / bsize; noff < size; bn++, noff += bsize -
 2583             noff % bsize) {
 2584                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 2585                 if (error == EOPNOTSUPP) {
 2586                         error = ENOTTY;
 2587                         goto out;
 2588                 }
 2589                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 2590                     (bnp != -1 && cmd == FIOSEEKDATA)) {
 2591                         noff = bn * bsize;
 2592                         if (noff < *off)
 2593                                 noff = *off;
 2594                         goto out;
 2595                 }
 2596         }
 2597         if (noff > size)
 2598                 noff = size;
 2599         /* noff == size. There is an implicit hole at the end of file. */
 2600         if (cmd == FIOSEEKDATA)
 2601                 error = ENXIO;
 2602 out:
 2603         if (error == 0)
 2604                 *off = noff;
 2605         return (error);
 2606 }
 2607 
 2608 int
 2609 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 2610 {
 2611         int error;
 2612 
 2613         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 2614             ("%s: Wrong command %lu", __func__, cmd));
 2615 
 2616         if (vn_lock(vp, LK_SHARED) != 0)
 2617                 return (EBADF);
 2618         error = vn_bmap_seekhole_locked(vp, cmd, off, cred);
 2619         VOP_UNLOCK(vp);
 2620         return (error);
 2621 }
 2622 
 2623 int
 2624 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 2625 {
 2626         struct ucred *cred;
 2627         struct vnode *vp;
 2628         off_t foffset, fsize, size;
 2629         int error, noneg;
 2630 
 2631         cred = td->td_ucred;
 2632         vp = fp->f_vnode;
 2633         foffset = foffset_lock(fp, 0);
 2634         noneg = (vp->v_type != VCHR);
 2635         error = 0;
 2636         switch (whence) {
 2637         case L_INCR:
 2638                 if (noneg &&
 2639                     (foffset < 0 ||
 2640                     (offset > 0 && foffset > OFF_MAX - offset))) {
 2641                         error = EOVERFLOW;
 2642                         break;
 2643                 }
 2644                 offset += foffset;
 2645                 break;
 2646         case L_XTND:
 2647                 error = vn_getsize(vp, &fsize, cred);
 2648                 if (error != 0)
 2649                         break;
 2650 
 2651                 /*
 2652                  * If the file references a disk device, then fetch
 2653                  * the media size and use that to determine the ending
 2654                  * offset.
 2655                  */
 2656                 if (fsize == 0 && vp->v_type == VCHR &&
 2657                     fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
 2658                         fsize = size;
 2659                 if (noneg && offset > 0 && fsize > OFF_MAX - offset) {
 2660                         error = EOVERFLOW;
 2661                         break;
 2662                 }
 2663                 offset += fsize;
 2664                 break;
 2665         case L_SET:
 2666                 break;
 2667         case SEEK_DATA:
 2668                 error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
 2669                 if (error == ENOTTY)
 2670                         error = EINVAL;
 2671                 break;
 2672         case SEEK_HOLE:
 2673                 error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
 2674                 if (error == ENOTTY)
 2675                         error = EINVAL;
 2676                 break;
 2677         default:
 2678                 error = EINVAL;
 2679         }
 2680         if (error == 0 && noneg && offset < 0)
 2681                 error = EINVAL;
 2682         if (error != 0)
 2683                 goto drop;
 2684         VFS_KNOTE_UNLOCKED(vp, 0);
 2685         td->td_uretoff.tdu_off = offset;
 2686 drop:
 2687         foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 2688         return (error);
 2689 }
 2690 
 2691 int
 2692 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
 2693     struct thread *td)
 2694 {
 2695         int error;
 2696 
 2697         /*
 2698          * Grant permission if the caller is the owner of the file, or
 2699          * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
 2700          * the file.  If the time pointer is null, then write
 2701          * permission on the file is also sufficient.
 2702          *
 2703          * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
 2704          * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
 2705          * will be allowed to set the times [..] to the current
 2706          * server time.
 2707          */
 2708         error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
 2709         if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
 2710                 error = VOP_ACCESS(vp, VWRITE, cred, td);
 2711         return (error);
 2712 }
 2713 
 2714 int
 2715 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 2716 {
 2717         struct vnode *vp;
 2718         int error;
 2719 
 2720         if (fp->f_type == DTYPE_FIFO)
 2721                 kif->kf_type = KF_TYPE_FIFO;
 2722         else
 2723                 kif->kf_type = KF_TYPE_VNODE;
 2724         vp = fp->f_vnode;
 2725         vref(vp);
 2726         FILEDESC_SUNLOCK(fdp);
 2727         error = vn_fill_kinfo_vnode(vp, kif);
 2728         vrele(vp);
 2729         FILEDESC_SLOCK(fdp);
 2730         return (error);
 2731 }
 2732 
 2733 static inline void
 2734 vn_fill_junk(struct kinfo_file *kif)
 2735 {
 2736         size_t len, olen;
 2737 
 2738         /*
 2739          * Simulate vn_fullpath returning changing values for a given
 2740          * vp during e.g. coredump.
 2741          */
 2742         len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
 2743         olen = strlen(kif->kf_path);
 2744         if (len < olen)
 2745                 strcpy(&kif->kf_path[len - 1], "$");
 2746         else
 2747                 for (; olen < len; olen++)
 2748                         strcpy(&kif->kf_path[olen], "A");
 2749 }
 2750 
 2751 int
 2752 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
 2753 {
 2754         struct vattr va;
 2755         char *fullpath, *freepath;
 2756         int error;
 2757 
 2758         kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
 2759         freepath = NULL;
 2760         fullpath = "-";
 2761         error = vn_fullpath(vp, &fullpath, &freepath);
 2762         if (error == 0) {
 2763                 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 2764         }
 2765         if (freepath != NULL)
 2766                 free(freepath, M_TEMP);
 2767 
 2768         KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
 2769                 vn_fill_junk(kif);
 2770         );
 2771 
 2772         /*
 2773          * Retrieve vnode attributes.
 2774          */
 2775         va.va_fsid = VNOVAL;
 2776         va.va_rdev = NODEV;
 2777         vn_lock(vp, LK_SHARED | LK_RETRY);
 2778         error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 2779         VOP_UNLOCK(vp);
 2780         if (error != 0)
 2781                 return (error);
 2782         if (va.va_fsid != VNOVAL)
 2783                 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
 2784         else
 2785                 kif->kf_un.kf_file.kf_file_fsid =
 2786                     vp->v_mount->mnt_stat.f_fsid.val[0];
 2787         kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
 2788             kif->kf_un.kf_file.kf_file_fsid; /* truncate */
 2789         kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
 2790         kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
 2791         kif->kf_un.kf_file.kf_file_size = va.va_size;
 2792         kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
 2793         kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
 2794             kif->kf_un.kf_file.kf_file_rdev; /* truncate */
 2795         kif->kf_un.kf_file.kf_file_nlink = va.va_nlink;
 2796         return (0);
 2797 }
 2798 
 2799 int
 2800 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
 2801     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
 2802     struct thread *td)
 2803 {
 2804 #ifdef HWPMC_HOOKS
 2805         struct pmckern_map_in pkm;
 2806 #endif
 2807         struct mount *mp;
 2808         struct vnode *vp;
 2809         vm_object_t object;
 2810         vm_prot_t maxprot;
 2811         boolean_t writecounted;
 2812         int error;
 2813 
 2814 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
 2815     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
 2816         /*
 2817          * POSIX shared-memory objects are defined to have
 2818          * kernel persistence, and are not defined to support
 2819          * read(2)/write(2) -- or even open(2).  Thus, we can
 2820          * use MAP_NOSYNC to trade on-disk coherence for speed.
 2821          * The shm_open(3) library routine turns on the FPOSIXSHM
 2822          * flag to request this behavior.
 2823          */
 2824         if ((fp->f_flag & FPOSIXSHM) != 0)
 2825                 flags |= MAP_NOSYNC;
 2826 #endif
 2827         vp = fp->f_vnode;
 2828 
 2829         /*
 2830          * Ensure that file and memory protections are
 2831          * compatible.  Note that we only worry about
 2832          * writability if mapping is shared; in this case,
 2833          * current and max prot are dictated by the open file.
 2834          * XXX use the vnode instead?  Problem is: what
 2835          * credentials do we use for determination? What if
 2836          * proc does a setuid?
 2837          */
 2838         mp = vp->v_mount;
 2839         if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 2840                 maxprot = VM_PROT_NONE;
 2841                 if ((prot & VM_PROT_EXECUTE) != 0)
 2842                         return (EACCES);
 2843         } else
 2844                 maxprot = VM_PROT_EXECUTE;
 2845         if ((fp->f_flag & FREAD) != 0)
 2846                 maxprot |= VM_PROT_READ;
 2847         else if ((prot & VM_PROT_READ) != 0)
 2848                 return (EACCES);
 2849 
 2850         /*
 2851          * If we are sharing potential changes via MAP_SHARED and we
 2852          * are trying to get write permission but did not open the file
 2853          * for writing, bail out.
 2854          */
 2855         if ((flags & MAP_SHARED) != 0) {
 2856                 if ((fp->f_flag & FWRITE) != 0)
 2857                         maxprot |= VM_PROT_WRITE;
 2858                 else if ((prot & VM_PROT_WRITE) != 0)
 2859                         return (EACCES);
 2860         } else {
 2861                 maxprot |= VM_PROT_WRITE;
 2862                 cap_maxprot |= VM_PROT_WRITE;
 2863         }
 2864         maxprot &= cap_maxprot;
 2865 
 2866         /*
 2867          * For regular files and shared memory, POSIX requires that
 2868          * the value of foff be a legitimate offset within the data
 2869          * object.  In particular, negative offsets are invalid.
 2870          * Blocking negative offsets and overflows here avoids
 2871          * possible wraparound or user-level access into reserved
 2872          * ranges of the data object later.  In contrast, POSIX does
 2873          * not dictate how offsets are used by device drivers, so in
 2874          * the case of a device mapping a negative offset is passed
 2875          * on.
 2876          */
 2877         if (
 2878 #ifdef _LP64
 2879             size > OFF_MAX ||
 2880 #endif
 2881             foff > OFF_MAX - size)
 2882                 return (EINVAL);
 2883 
 2884         writecounted = FALSE;
 2885         error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
 2886             &foff, &object, &writecounted);
 2887         if (error != 0)
 2888                 return (error);
 2889         error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 2890             foff, writecounted, td);
 2891         if (error != 0) {
 2892                 /*
 2893                  * If this mapping was accounted for in the vnode's
 2894                  * writecount, then undo that now.
 2895                  */
 2896                 if (writecounted)
 2897                         vm_pager_release_writecount(object, 0, size);
 2898                 vm_object_deallocate(object);
 2899         }
 2900 #ifdef HWPMC_HOOKS
 2901         /* Inform hwpmc(4) if an executable is being mapped. */
 2902         if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
 2903                 if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
 2904                         pkm.pm_file = vp;
 2905                         pkm.pm_address = (uintptr_t) *addr;
 2906                         PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
 2907                 }
 2908         }
 2909 #endif
 2910         return (error);
 2911 }
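
/*
 * Worked example (illustrative): for a file opened O_RDONLY on a mount
 * without MNT_NOEXEC and mapped with MAP_SHARED and PROT_READ, the
 * computation above yields
 *
 *	maxprot  = VM_PROT_EXECUTE	(exec permitted by the mount)
 *	maxprot |= VM_PROT_READ		(FREAD is set on the open file)
 *
 * VM_PROT_WRITE is not added because the mapping is shared and FWRITE is
 * clear; a PROT_WRITE request on that mapping would instead return EACCES.
 * Finally maxprot &= cap_maxprot, so capability rights can only narrow
 * the result.
 */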
 2912 
 2913 void
 2914 vn_fsid(struct vnode *vp, struct vattr *va)
 2915 {
 2916         fsid_t *f;
 2917 
 2918         f = &vp->v_mount->mnt_stat.f_fsid;
 2919         va->va_fsid = (uint32_t)f->val[1];
 2920         va->va_fsid <<= sizeof(f->val[1]) * NBBY;
 2921         va->va_fsid += (uint32_t)f->val[0];
 2922 }
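
/*
 * Illustrative example: assuming va_fsid is at least 64 bits wide (as the
 * 32-bit shift above implies), f->val[1] == 0x22222222 and
 * f->val[0] == 0x11111111 produce va_fsid == 0x2222222211111111, i.e.
 * val[1] in the upper 32 bits and val[0] in the lower 32 bits.
 */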
 2923 
 2924 int
 2925 vn_fsync_buf(struct vnode *vp, int waitfor)
 2926 {
 2927         struct buf *bp, *nbp;
 2928         struct bufobj *bo;
 2929         struct mount *mp;
 2930         int error, maxretry;
 2931 
 2932         error = 0;
 2933         maxretry = 10000;     /* large, arbitrarily chosen */
 2934         mp = NULL;
 2935         if (vp->v_type == VCHR) {
 2936                 VI_LOCK(vp);
 2937                 mp = vp->v_rdev->si_mountpt;
 2938                 VI_UNLOCK(vp);
 2939         }
 2940         bo = &vp->v_bufobj;
 2941         BO_LOCK(bo);
 2942 loop1:
 2943         /*
 2944          * MARK/SCAN initialization to avoid infinite loops.
 2945          */
 2946         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 2947                 bp->b_vflags &= ~BV_SCANNED;
 2948                 bp->b_error = 0;
 2949         }
 2950 
 2951         /*
 2952          * Flush all dirty buffers associated with a vnode.
 2953          */
 2954 loop2:
 2955         TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 2956                 if ((bp->b_vflags & BV_SCANNED) != 0)
 2957                         continue;
 2958                 bp->b_vflags |= BV_SCANNED;
 2959                 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 2960                         if (waitfor != MNT_WAIT)
 2961                                 continue;
 2962                         if (BUF_LOCK(bp,
 2963                             LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
 2964                             BO_LOCKPTR(bo)) != 0) {
 2965                                 BO_LOCK(bo);
 2966                                 goto loop1;
 2967                         }
 2968                         BO_LOCK(bo);
 2969                 }
 2970                 BO_UNLOCK(bo);
 2971                 KASSERT(bp->b_bufobj == bo,
 2972                     ("bp %p wrong b_bufobj %p should be %p",
 2973                     bp, bp->b_bufobj, bo));
 2974                 if ((bp->b_flags & B_DELWRI) == 0)
 2975                         panic("fsync: not dirty");
 2976                 if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
 2977                         vfs_bio_awrite(bp);
 2978                 } else {
 2979                         bremfree(bp);
 2980                         bawrite(bp);
 2981                 }
 2982                 if (maxretry < 1000)
 2983                         pause("dirty", hz < 1000 ? 1 : hz / 1000);
 2984                 BO_LOCK(bo);
 2985                 goto loop2;
 2986         }
 2987 
 2988         /*
  2989          * If synchronous, the caller expects us to completely resolve all
 2990          * dirty buffers in the system.  Wait for in-progress I/O to
 2991          * complete (which could include background bitmap writes), then
 2992          * retry if dirty blocks still exist.
 2993          */
 2994         if (waitfor == MNT_WAIT) {
 2995                 bufobj_wwait(bo, 0, 0);
 2996                 if (bo->bo_dirty.bv_cnt > 0) {
 2997                         /*
 2998                          * If we are unable to write any of these buffers
 2999                          * then we fail now rather than trying endlessly
 3000                          * to write them out.
 3001                          */
 3002                         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 3003                                 if ((error = bp->b_error) != 0)
 3004                                         break;
 3005                         if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
 3006                             (error == 0 && --maxretry >= 0))
 3007                                 goto loop1;
 3008                         if (error == 0)
 3009                                 error = EAGAIN;
 3010                 }
 3011         }
 3012         BO_UNLOCK(bo);
 3013         if (error != 0)
 3014                 vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
 3015 
 3016         return (error);
 3017 }
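
/*
 * A minimal sketch (hypothetical "xxfs" names, for illustration only) of how
 * a simple file system's VOP_FSYNC implementation might delegate its dirty
 * buffer flushing to vn_fsync_buf() before writing back its own metadata:
 */
#if 0
static int
xxfs_fsync(struct vop_fsync_args *ap)
{
	int error;

	/* Flush all dirty buffers attached to the vnode. */
	error = vn_fsync_buf(ap->a_vp, ap->a_waitfor);
	if (error != 0)
		return (error);
	/* File-system specific inode/metadata writeback would follow. */
	return (xxfs_update_inode(ap->a_vp, ap->a_waitfor == MNT_WAIT));
}
#endif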
 3018 
 3019 /*
 3020  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
 3021  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
 3022  * to do the actual copy.
  3023  * vn_generic_copy_file_range() is factored out so that it can also be
  3024  * called from a VOP_COPY_FILE_RANGE() implementation; it handles vnodes
  3025  * from different file systems.
 3026  */
 3027 int
 3028 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
 3029     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
 3030     struct ucred *outcred, struct thread *fsize_td)
 3031 {
 3032         int error;
 3033         size_t len;
 3034         uint64_t uval;
 3035 
 3036         len = *lenp;
 3037         *lenp = 0;              /* For error returns. */
 3038         error = 0;
 3039 
 3040         /* Do some sanity checks on the arguments. */
 3041         if (invp->v_type == VDIR || outvp->v_type == VDIR)
 3042                 error = EISDIR;
 3043         else if (*inoffp < 0 || *outoffp < 0 ||
 3044             invp->v_type != VREG || outvp->v_type != VREG)
 3045                 error = EINVAL;
 3046         if (error != 0)
 3047                 goto out;
 3048 
 3049         /* Ensure offset + len does not wrap around. */
 3050         uval = *inoffp;
 3051         uval += len;
 3052         if (uval > INT64_MAX)
 3053                 len = INT64_MAX - *inoffp;
 3054         uval = *outoffp;
 3055         uval += len;
 3056         if (uval > INT64_MAX)
 3057                 len = INT64_MAX - *outoffp;
 3058         if (len == 0)
 3059                 goto out;
 3060 
 3061         /*
  3062          * If the two vnodes are on the same file system, call
 3063          * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
 3064          * which can handle copies across multiple file systems.
 3065          */
 3066         *lenp = len;
 3067         if (invp->v_mount == outvp->v_mount)
 3068                 error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
 3069                     lenp, flags, incred, outcred, fsize_td);
 3070         else
 3071                 error = vn_generic_copy_file_range(invp, inoffp, outvp,
 3072                     outoffp, lenp, flags, incred, outcred, fsize_td);
 3073 out:
 3074         return (error);
 3075 }
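
/*
 * A minimal usage sketch (illustrative; "invp", "outvp", "td" and the
 * requested length are assumed to come from the caller, normally the
 * copy_file_range(2) system call path, which also handles file descriptors
 * and writing the updated offsets back to user space):
 */
#if 0
	off_t inoff = 0, outoff = 0;
	size_t len = 1024 * 1024;	/* bytes requested */
	int error;

	error = vn_copy_file_range(invp, &inoff, outvp, &outoff, &len,
	    COPY_FILE_RANGE_TIMEO1SEC, td->td_ucred, td->td_ucred, td);
	/* On return, len holds the number of bytes actually copied. */
#endif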
 3076 
 3077 /*
 3078  * Test len bytes of data starting at dat for all bytes == 0.
 3079  * Return true if all bytes are zero, false otherwise.
 3080  * Expects dat to be well aligned.
 3081  */
 3082 static bool
 3083 mem_iszero(void *dat, int len)
 3084 {
 3085         int i;
 3086         const u_int *p;
 3087         const char *cp;
 3088 
 3089         for (p = dat; len > 0; len -= sizeof(*p), p++) {
 3090                 if (len >= sizeof(*p)) {
 3091                         if (*p != 0)
 3092                                 return (false);
 3093                 } else {
 3094                         cp = (const char *)p;
 3095                         for (i = 0; i < len; i++, cp++)
 3096                                 if (*cp != '\0')
 3097                                         return (false);
 3098                 }
 3099         }
 3100         return (true);
 3101 }
 3102 
 3103 /*
 3104  * Look for a hole in the output file and, if found, adjust *outoffp
 3105  * and *xferp to skip past the hole.
  3106  * *xferp is the entire hole length to be written; the returned value is how
  3107  * many bytes are to be written as 0's.
 3108  */
 3109 static off_t
 3110 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
 3111     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
 3112 {
 3113         int error;
 3114         off_t delta;
 3115 
 3116         if (*holeoffp == 0 || *holeoffp <= *outoffp) {
 3117                 *dataoffp = *outoffp;
 3118                 error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
 3119                     curthread);
 3120                 if (error == 0) {
 3121                         *holeoffp = *dataoffp;
 3122                         error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
 3123                             curthread);
 3124                 }
 3125                 if (error != 0 || *holeoffp == *dataoffp) {
 3126                         /*
 3127                          * Since outvp is unlocked, it may be possible for
 3128                          * another thread to do a truncate(), lseek(), write()
 3129                          * creating a hole at startoff between the above
 3130                          * VOP_IOCTL() calls, if the other thread does not do
 3131                          * rangelocking.
 3132                          * If that happens, *holeoffp == *dataoffp and finding
 3133                          * the hole has failed, so disable vn_skip_hole().
 3134                          */
 3135                         *holeoffp = -1; /* Disable use of vn_skip_hole(). */
 3136                         return (xfer2);
 3137                 }
 3138                 KASSERT(*dataoffp >= *outoffp,
 3139                     ("vn_skip_hole: dataoff=%jd < outoff=%jd",
 3140                     (intmax_t)*dataoffp, (intmax_t)*outoffp));
 3141                 KASSERT(*holeoffp > *dataoffp,
 3142                     ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
 3143                     (intmax_t)*holeoffp, (intmax_t)*dataoffp));
 3144         }
 3145 
 3146         /*
 3147          * If there is a hole before the data starts, advance *outoffp and
 3148          * *xferp past the hole.
 3149          */
 3150         if (*dataoffp > *outoffp) {
 3151                 delta = *dataoffp - *outoffp;
 3152                 if (delta >= *xferp) {
 3153                         /* Entire *xferp is a hole. */
 3154                         *outoffp += *xferp;
 3155                         *xferp = 0;
 3156                         return (0);
 3157                 }
 3158                 *xferp -= delta;
 3159                 *outoffp += delta;
 3160                 xfer2 = MIN(xfer2, *xferp);
 3161         }
 3162 
 3163         /*
 3164          * If a hole starts before the end of this xfer2, reduce this xfer2 so
 3165          * that the write ends at the start of the hole.
 3166          * *holeoffp should always be greater than *outoffp, but for the
 3167          * non-INVARIANTS case, check this to make sure xfer2 remains a sane
 3168          * value.
 3169          */
 3170         if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
 3171                 xfer2 = *holeoffp - *outoffp;
 3172         return (xfer2);
 3173 }
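
/*
 * Worked example (illustrative): with *outoffp == 100, *xferp == 1000 and
 * xfer2 == 4096, suppose FIOSEEKDATA reports data at 600 and FIOSEEKHOLE the
 * next hole at 900.  The leading 500 byte hole is skipped (*outoffp becomes
 * 600, *xferp becomes 500), xfer2 is clipped to MIN(4096, 500) == 500 and
 * then to *holeoffp - *outoffp == 300, which is the value returned, so the
 * next write stops exactly at the start of the existing hole.
 */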
 3174 
 3175 /*
 3176  * Write an xfer sized chunk to outvp in blksize blocks from dat.
 3177  * dat is a maximum of blksize in length and can be written repeatedly in
 3178  * the chunk.
 3179  * If growfile == true, just grow the file via vn_truncate_locked() instead
 3180  * of doing actual writes.
 3181  * If checkhole == true, a hole is being punched, so skip over any hole
 3182  * already in the output file.
 3183  */
 3184 static int
 3185 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
 3186     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
 3187 {
 3188         struct mount *mp;
 3189         off_t dataoff, holeoff, xfer2;
 3190         int error;
 3191 
 3192         /*
 3193          * Loop around doing writes of blksize until write has been completed.
 3194          * Lock/unlock on each loop iteration so that a bwillwrite() can be
 3195          * done for each iteration, since the xfer argument can be very
 3196          * large if there is a large hole to punch in the output file.
 3197          */
 3198         error = 0;
 3199         holeoff = 0;
 3200         do {
 3201                 xfer2 = MIN(xfer, blksize);
 3202                 if (checkhole) {
 3203                         /*
 3204                          * Punching a hole.  Skip writing if there is
 3205                          * already a hole in the output file.
 3206                          */
 3207                         xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
 3208                             &dataoff, &holeoff, cred);
 3209                         if (xfer == 0)
 3210                                 break;
 3211                         if (holeoff < 0)
 3212                                 checkhole = false;
 3213                         KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
 3214                             (intmax_t)xfer2));
 3215                 }
 3216                 bwillwrite();
 3217                 mp = NULL;
 3218                 error = vn_start_write(outvp, &mp, V_WAIT);
 3219                 if (error != 0)
 3220                         break;
 3221                 if (growfile) {
 3222                         error = vn_lock(outvp, LK_EXCLUSIVE);
 3223                         if (error == 0) {
 3224                                 error = vn_truncate_locked(outvp, outoff + xfer,
 3225                                     false, cred);
 3226                                 VOP_UNLOCK(outvp);
 3227                         }
 3228                 } else {
 3229                         error = vn_lock(outvp, vn_lktype_write(mp, outvp));
 3230                         if (error == 0) {
 3231                                 error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
 3232                                     outoff, UIO_SYSSPACE, IO_NODELOCKED,
 3233                                     curthread->td_ucred, cred, NULL, curthread);
 3234                                 outoff += xfer2;
 3235                                 xfer -= xfer2;
 3236                                 VOP_UNLOCK(outvp);
 3237                         }
 3238                 }
 3239                 if (mp != NULL)
 3240                         vn_finished_write(mp);
 3241         } while (!growfile && xfer > 0 && error == 0);
 3242         return (error);
 3243 }
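
/*
 * Illustrative summary of the three ways the copy loop in
 * vn_generic_copy_file_range() below uses this helper (argument values are
 * examples only):
 *
 *	(..., blksize, false, false, cred)  plain data write from dat;
 *	(..., blksize, false, true, cred)   write 0s to punch a hole, skipping
 *					    holes already present in outvp;
 *	(..., blksize, true, false, cred)   just grow the file with
 *					    vn_truncate_locked(), no writes.
 */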
 3244 
 3245 /*
 3246  * Copy a byte range of one file to another.  This function can handle the
 3247  * case where invp and outvp are on different file systems.
 3248  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
 3249  * is no better file system specific way to do it.
 3250  */
 3251 int
 3252 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
 3253     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
 3254     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
 3255 {
 3256         struct mount *mp;
 3257         off_t startoff, endoff, xfer, xfer2;
 3258         u_long blksize;
 3259         int error, interrupted;
 3260         bool cantseek, readzeros, eof, lastblock, holetoeof;
 3261         ssize_t aresid, r = 0;
 3262         size_t copylen, len, savlen;
 3263         off_t insize, outsize;
 3264         char *dat;
 3265         long holein, holeout;
 3266         struct timespec curts, endts;
 3267 
 3268         holein = holeout = 0;
 3269         savlen = len = *lenp;
 3270         error = 0;
 3271         interrupted = 0;
 3272         dat = NULL;
 3273 
 3274         error = vn_lock(invp, LK_SHARED);
 3275         if (error != 0)
 3276                 goto out;
 3277         if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
 3278                 holein = 0;
 3279         if (holein > 0)
 3280                 error = vn_getsize_locked(invp, &insize, incred);
 3281         VOP_UNLOCK(invp);
 3282         if (error != 0)
 3283                 goto out;
 3284 
 3285         mp = NULL;
 3286         error = vn_start_write(outvp, &mp, V_WAIT);
 3287         if (error == 0)
 3288                 error = vn_lock(outvp, LK_EXCLUSIVE);
 3289         if (error == 0) {
 3290                 /*
 3291                  * If fsize_td != NULL, do a vn_rlimit_fsizex() call,
 3292                  * now that outvp is locked.
 3293                  */
 3294                 if (fsize_td != NULL) {
 3295                         struct uio io;
 3296 
 3297                         io.uio_offset = *outoffp;
 3298                         io.uio_resid = len;
 3299                         error = vn_rlimit_fsizex(outvp, &io, 0, &r, fsize_td);
 3300                         len = savlen = io.uio_resid;
 3301                         /*
 3302                          * No need to call vn_rlimit_fsizex_res before return,
 3303                          * since the uio is local.
 3304                          */
 3305                 }
 3306                 if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
 3307                         holeout = 0;
 3308                 /*
 3309                  * Holes that are past EOF do not need to be written as a block
 3310                  * of zero bytes.  So, truncate the output file as far as
 3311                  * possible and then use size to decide if writing 0
 3312                  * bytes is necessary in the loop below.
 3313                  */
 3314                 if (error == 0)
 3315                         error = vn_getsize_locked(outvp, &outsize, outcred);
 3316                 if (error == 0 && outsize > *outoffp && outsize <= *outoffp + len) {
 3317 #ifdef MAC
 3318                         error = mac_vnode_check_write(curthread->td_ucred,
 3319                             outcred, outvp);
 3320                         if (error == 0)
 3321 #endif
 3322                                 error = vn_truncate_locked(outvp, *outoffp,
 3323                                     false, outcred);
 3324                         if (error == 0)
 3325                                 outsize = *outoffp;
 3326                 }
 3327                 VOP_UNLOCK(outvp);
 3328         }
 3329         if (mp != NULL)
 3330                 vn_finished_write(mp);
 3331         if (error != 0)
 3332                 goto out;
 3333 
 3334         if (holein == 0 && holeout > 0) {
 3335                 /*
 3336                  * For this special case, the input data will be scanned
 3337                  * for blocks of all 0 bytes.  For these blocks, the
 3338                  * write can be skipped for the output file to create
 3339                  * an unallocated region.
 3340                  * Therefore, use the appropriate size for the output file.
 3341                  */
 3342                 blksize = holeout;
 3343                 if (blksize <= 512) {
 3344                         /*
 3345                          * Use f_iosize, since ZFS reports a _PC_MIN_HOLE_SIZE
 3346                          * of 512, although it actually only creates
 3347                          * unallocated regions for blocks >= f_iosize.
 3348                          */
 3349                         blksize = outvp->v_mount->mnt_stat.f_iosize;
 3350                 }
 3351         } else {
 3352                 /*
 3353                  * Use the larger of the two f_iosize values.  If they are
 3354                  * not the same size, one will normally be an exact multiple of
 3355                  * the other, since they are both likely to be a power of 2.
 3356                  */
 3357                 blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
 3358                     outvp->v_mount->mnt_stat.f_iosize);
 3359         }
 3360 
 3361         /* Clip to sane limits. */
 3362         if (blksize < 4096)
 3363                 blksize = 4096;
 3364         else if (blksize > maxphys)
 3365                 blksize = maxphys;
 3366         dat = malloc(blksize, M_TEMP, M_WAITOK);
 3367 
 3368         /*
 3369          * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
 3370          * to find holes.  Otherwise, just scan the read block for all 0s
 3371          * in the inner loop where the data copying is done.
 3372          * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
 3373          * support holes on the server, but do not support FIOSEEKHOLE.
 3374          * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate
  3375          * that this function should return after 1 second with a partial
 3376          * completion.
 3377          */
 3378         if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) {
 3379                 getnanouptime(&endts);
 3380                 endts.tv_sec++;
 3381         } else
 3382                 timespecclear(&endts);
 3383         holetoeof = eof = false;
 3384         while (len > 0 && error == 0 && !eof && interrupted == 0) {
 3385                 endoff = 0;                     /* To shut up compilers. */
 3386                 cantseek = true;
 3387                 startoff = *inoffp;
 3388                 copylen = len;
 3389 
 3390                 /*
 3391                  * Find the next data area.  If there is just a hole to EOF,
 3392                  * FIOSEEKDATA should fail with ENXIO.
 3393                  * (I do not know if any file system will report a hole to
 3394                  *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
 3395                  *  will fail for those file systems.)
 3396                  *
 3397                  * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
 3398                  * the code just falls through to the inner copy loop.
 3399                  */
 3400                 error = EINVAL;
 3401                 if (holein > 0) {
 3402                         error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
 3403                             incred, curthread);
 3404                         if (error == ENXIO) {
 3405                                 startoff = endoff = insize;
 3406                                 eof = holetoeof = true;
 3407                                 error = 0;
 3408                         }
 3409                 }
 3410                 if (error == 0 && !holetoeof) {
 3411                         endoff = startoff;
 3412                         error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
 3413                             incred, curthread);
 3414                         /*
 3415                          * Since invp is unlocked, it may be possible for
 3416                          * another thread to do a truncate(), lseek(), write()
 3417                          * creating a hole at startoff between the above
 3418                          * VOP_IOCTL() calls, if the other thread does not do
 3419                          * rangelocking.
 3420                          * If that happens, startoff == endoff and finding
 3421                          * the hole has failed, so set an error.
 3422                          */
 3423                         if (error == 0 && startoff == endoff)
 3424                                 error = EINVAL; /* Any error. Reset to 0. */
 3425                 }
 3426                 if (error == 0) {
 3427                         if (startoff > *inoffp) {
 3428                                 /* Found hole before data block. */
 3429                                 xfer = MIN(startoff - *inoffp, len);
 3430                                 if (*outoffp < outsize) {
 3431                                         /* Must write 0s to punch hole. */
 3432                                         xfer2 = MIN(outsize - *outoffp,
 3433                                             xfer);
 3434                                         memset(dat, 0, MIN(xfer2, blksize));
 3435                                         error = vn_write_outvp(outvp, dat,
 3436                                             *outoffp, xfer2, blksize, false,
 3437                                             holeout > 0, outcred);
 3438                                 }
 3439 
 3440                                 if (error == 0 && *outoffp + xfer >
 3441                                     outsize && (xfer == len || holetoeof)) {
 3442                                         /* Grow output file (hole at end). */
 3443                                         error = vn_write_outvp(outvp, dat,
 3444                                             *outoffp, xfer, blksize, true,
 3445                                             false, outcred);
 3446                                 }
 3447                                 if (error == 0) {
 3448                                         *inoffp += xfer;
 3449                                         *outoffp += xfer;
 3450                                         len -= xfer;
 3451                                         if (len < savlen) {
 3452                                                 interrupted = sig_intr();
 3453                                                 if (timespecisset(&endts) &&
 3454                                                     interrupted == 0) {
 3455                                                         getnanouptime(&curts);
 3456                                                         if (timespeccmp(&curts,
 3457                                                             &endts, >=))
 3458                                                                 interrupted =
 3459                                                                     EINTR;
 3460                                                 }
 3461                                         }
 3462                                 }
 3463                         }
 3464                         copylen = MIN(len, endoff - startoff);
 3465                         cantseek = false;
 3466                 } else {
 3467                         cantseek = true;
 3468                         startoff = *inoffp;
 3469                         copylen = len;
 3470                         error = 0;
 3471                 }
 3472 
 3473                 xfer = blksize;
 3474                 if (cantseek) {
 3475                         /*
 3476                          * Set first xfer to end at a block boundary, so that
  3477                          * holes are more likely to be detected in the loop
  3478                          * below via the all-bytes-zero check.
 3479                          */
 3480                         xfer -= (*inoffp % blksize);
 3481                 }
 3482                 /* Loop copying the data block. */
 3483                 while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
 3484                         if (copylen < xfer)
 3485                                 xfer = copylen;
 3486                         error = vn_lock(invp, LK_SHARED);
 3487                         if (error != 0)
 3488                                 goto out;
 3489                         error = vn_rdwr(UIO_READ, invp, dat, xfer,
 3490                             startoff, UIO_SYSSPACE, IO_NODELOCKED,
 3491                             curthread->td_ucred, incred, &aresid,
 3492                             curthread);
 3493                         VOP_UNLOCK(invp);
 3494                         lastblock = false;
 3495                         if (error == 0 && aresid > 0) {
 3496                                 /* Stop the copy at EOF on the input file. */
 3497                                 xfer -= aresid;
 3498                                 eof = true;
 3499                                 lastblock = true;
 3500                         }
 3501                         if (error == 0) {
 3502                                 /*
 3503                                  * Skip the write for holes past the initial EOF
 3504                                  * of the output file, unless this is the last
 3505                                  * write of the output file at EOF.
 3506                                  */
 3507                                 readzeros = cantseek ? mem_iszero(dat, xfer) :
 3508                                     false;
 3509                                 if (xfer == len)
 3510                                         lastblock = true;
 3511                                 if (!cantseek || *outoffp < outsize ||
 3512                                     lastblock || !readzeros)
 3513                                         error = vn_write_outvp(outvp, dat,
 3514                                             *outoffp, xfer, blksize,
 3515                                             readzeros && lastblock &&
 3516                                             *outoffp >= outsize, false,
 3517                                             outcred);
 3518                                 if (error == 0) {
 3519                                         *inoffp += xfer;
 3520                                         startoff += xfer;
 3521                                         *outoffp += xfer;
 3522                                         copylen -= xfer;
 3523                                         len -= xfer;
 3524                                         if (len < savlen) {
 3525                                                 interrupted = sig_intr();
 3526                                                 if (timespecisset(&endts) &&
 3527                                                     interrupted == 0) {
 3528                                                         getnanouptime(&curts);
 3529                                                         if (timespeccmp(&curts,
 3530                                                             &endts, >=))
 3531                                                                 interrupted =
 3532                                                                     EINTR;
 3533                                                 }
 3534                                         }
 3535                                 }
 3536                         }
 3537                         xfer = blksize;
 3538                 }
 3539         }
 3540 out:
 3541         *lenp = savlen - len;
 3542         free(dat, M_TEMP);
 3543         return (error);
 3544 }
 3545 
 3546 static int
 3547 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 3548 {
 3549         struct mount *mp;
 3550         struct vnode *vp;
 3551         off_t olen, ooffset;
 3552         int error;
 3553 #ifdef AUDIT
 3554         int audited_vnode1 = 0;
 3555 #endif
 3556 
 3557         vp = fp->f_vnode;
 3558         if (vp->v_type != VREG)
 3559                 return (ENODEV);
 3560 
 3561         /* Allocating blocks may take a long time, so iterate. */
 3562         for (;;) {
 3563                 olen = len;
 3564                 ooffset = offset;
 3565 
 3566                 bwillwrite();
 3567                 mp = NULL;
 3568                 error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
 3569                 if (error != 0)
 3570                         break;
 3571                 error = vn_lock(vp, LK_EXCLUSIVE);
 3572                 if (error != 0) {
 3573                         vn_finished_write(mp);
 3574                         break;
 3575                 }
 3576 #ifdef AUDIT
 3577                 if (!audited_vnode1) {
 3578                         AUDIT_ARG_VNODE1(vp);
 3579                         audited_vnode1 = 1;
 3580                 }
 3581 #endif
 3582 #ifdef MAC
 3583                 error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
 3584                 if (error == 0)
 3585 #endif
 3586                         error = VOP_ALLOCATE(vp, &offset, &len, 0,
 3587                             td->td_ucred);
 3588                 VOP_UNLOCK(vp);
 3589                 vn_finished_write(mp);
 3590 
 3591                 if (olen + ooffset != offset + len) {
 3592                         panic("offset + len changed from %jx/%jx to %jx/%jx",
 3593                             ooffset, olen, offset, len);
 3594                 }
 3595                 if (error != 0 || len == 0)
 3596                         break;
 3597                 KASSERT(olen > len, ("Iteration did not make progress?"));
 3598                 maybe_yield();
 3599         }
 3600 
 3601         return (error);
 3602 }
 3603 
 3604 static int
 3605 vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
 3606     int ioflag, struct ucred *cred, struct ucred *active_cred,
 3607     struct ucred *file_cred)
 3608 {
 3609         struct mount *mp;
 3610         void *rl_cookie;
 3611         off_t off, len;
 3612         int error;
 3613 #ifdef AUDIT
 3614         bool audited_vnode1 = false;
 3615 #endif
 3616 
 3617         rl_cookie = NULL;
 3618         error = 0;
 3619         mp = NULL;
 3620         off = *offset;
 3621         len = *length;
 3622 
 3623         if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0)
 3624                 rl_cookie = vn_rangelock_wlock(vp, off, off + len);
 3625         while (len > 0 && error == 0) {
 3626                 /*
 3627                  * Try to deallocate the longest range in one pass.
  3628          * If a pass takes too long to execute, it returns a partial
  3629          * result.  The residue will be processed in the next
  3630          * pass.
 3631                  */
 3632 
 3633                 if ((ioflag & IO_NODELOCKED) == 0) {
 3634                         bwillwrite();
 3635                         if ((error = vn_start_write(vp, &mp,
 3636                             V_WAIT | V_PCATCH)) != 0)
 3637                                 goto out;
 3638                         vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 3639                 }
 3640 #ifdef AUDIT
 3641                 if (!audited_vnode1) {
 3642                         AUDIT_ARG_VNODE1(vp);
 3643                         audited_vnode1 = true;
 3644                 }
 3645 #endif
 3646 
 3647 #ifdef MAC
 3648                 if ((ioflag & IO_NOMACCHECK) == 0)
 3649                         error = mac_vnode_check_write(active_cred, file_cred,
 3650                             vp);
 3651 #endif
 3652                 if (error == 0)
 3653                         error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag,
 3654                             cred);
 3655 
 3656                 if ((ioflag & IO_NODELOCKED) == 0) {
 3657                         VOP_UNLOCK(vp);
 3658                         if (mp != NULL) {
 3659                                 vn_finished_write(mp);
 3660                                 mp = NULL;
 3661                         }
 3662                 }
 3663                 if (error == 0 && len != 0)
 3664                         maybe_yield();
 3665         }
 3666 out:
 3667         if (rl_cookie != NULL)
 3668                 vn_rangelock_unlock(vp, rl_cookie);
 3669         *offset = off;
 3670         *length = len;
 3671         return (error);
 3672 }
 3673 
 3674 /*
  3675  * This function is intended for situations where the deallocation is not
  3676  * triggered by a user request.
 3677  */
 3678 int
 3679 vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
 3680     int ioflag, struct ucred *active_cred, struct ucred *file_cred)
 3681 {
 3682         struct ucred *cred;
 3683 
 3684         if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset ||
 3685             flags != 0)
 3686                 return (EINVAL);
 3687         if (vp->v_type != VREG)
 3688                 return (ENODEV);
 3689 
 3690         cred = file_cred != NOCRED ? file_cred : active_cred;
 3691         return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred,
 3692             active_cred, file_cred));
 3693 }
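
/*
 * A minimal usage sketch (illustrative; the vnode, offset, length and thread
 * are assumed to come from the caller) of punching a hole from kernel code
 * that holds no vnode locks:
 */
#if 0
	off_t off = 4096, len = 65536;
	int error;

	error = vn_deallocate(vp, &off, &len, 0, 0, td->td_ucred, NOCRED);
	/* On return, len is the residue that could not be deallocated. */
#endif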
 3694 
 3695 static int
 3696 vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
 3697     struct ucred *active_cred, struct thread *td)
 3698 {
 3699         int error;
 3700         struct vnode *vp;
 3701         int ioflag;
 3702 
 3703         KASSERT(cmd == SPACECTL_DEALLOC, ("vn_fspacectl: Invalid cmd"));
 3704         KASSERT((flags & ~SPACECTL_F_SUPPORTED) == 0,
 3705             ("vn_fspacectl: non-zero flags"));
 3706         KASSERT(*offset >= 0 && *length > 0 && *length <= OFF_MAX - *offset,
 3707             ("vn_fspacectl: offset/length overflow or underflow"));
 3708         vp = fp->f_vnode;
 3709 
 3710         if (vp->v_type != VREG)
 3711                 return (ENODEV);
 3712 
 3713         ioflag = get_write_ioflag(fp);
 3714 
 3715         switch (cmd) {
 3716         case SPACECTL_DEALLOC:
 3717                 error = vn_deallocate_impl(vp, offset, length, flags, ioflag,
 3718                     active_cred, active_cred, fp->f_cred);
 3719                 break;
 3720         default:
 3721                 panic("vn_fspacectl: unknown cmd %d", cmd);
 3722         }
 3723 
 3724         return (error);
 3725 }
 3726 
 3727 static u_long vn_lock_pair_pause_cnt;
 3728 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
 3729     &vn_lock_pair_pause_cnt, 0,
 3730     "Count of vn_lock_pair deadlocks");
 3731 
 3732 u_int vn_lock_pair_pause_max;
 3733 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
 3734     &vn_lock_pair_pause_max, 0,
 3735     "Max ticks for vn_lock_pair deadlock avoidance sleep");
 3736 
 3737 static void
 3738 vn_lock_pair_pause(const char *wmesg)
 3739 {
 3740         atomic_add_long(&vn_lock_pair_pause_cnt, 1);
 3741         pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
 3742 }
 3743 
 3744 /*
 3745  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
 3746  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
 3747  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
 3748  * can be NULL.
 3749  *
 3750  * The function returns with both vnodes exclusively locked, and
 3751  * guarantees that it does not create lock order reversal with other
 3752  * threads during its execution.  Both vnodes could be unlocked
  3753  * temporarily (and reclaimed).
 3754  */
 3755 void
 3756 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
 3757     bool vp2_locked)
 3758 {
 3759         int error;
 3760 
 3761         if (vp1 == NULL && vp2 == NULL)
 3762                 return;
 3763         if (vp1 != NULL) {
 3764                 if (vp1_locked)
 3765                         ASSERT_VOP_ELOCKED(vp1, "vp1");
 3766                 else
 3767                         ASSERT_VOP_UNLOCKED(vp1, "vp1");
 3768         } else {
 3769                 vp1_locked = true;
 3770         }
 3771         if (vp2 != NULL) {
 3772                 if (vp2_locked)
 3773                         ASSERT_VOP_ELOCKED(vp2, "vp2");
 3774                 else
 3775                         ASSERT_VOP_UNLOCKED(vp2, "vp2");
 3776         } else {
 3777                 vp2_locked = true;
 3778         }
 3779         if (!vp1_locked && !vp2_locked) {
 3780                 vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
 3781                 vp1_locked = true;
 3782         }
 3783 
 3784         for (;;) {
 3785                 if (vp1_locked && vp2_locked)
 3786                         break;
 3787                 if (vp1_locked && vp2 != NULL) {
 3788                         if (vp1 != NULL) {
 3789                                 error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
 3790                                     __FILE__, __LINE__);
 3791                                 if (error == 0)
 3792                                         break;
 3793                                 VOP_UNLOCK(vp1);
 3794                                 vp1_locked = false;
 3795                                 vn_lock_pair_pause("vlp1");
 3796                         }
 3797                         vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
 3798                         vp2_locked = true;
 3799                 }
 3800                 if (vp2_locked && vp1 != NULL) {
 3801                         if (vp2 != NULL) {
 3802                                 error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
 3803                                     __FILE__, __LINE__);
 3804                                 if (error == 0)
 3805                                         break;
 3806                                 VOP_UNLOCK(vp2);
 3807                                 vp2_locked = false;
 3808                                 vn_lock_pair_pause("vlp2");
 3809                         }
 3810                         vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
 3811                         vp1_locked = true;
 3812                 }
 3813         }
 3814         if (vp1 != NULL)
 3815                 ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
 3816         if (vp2 != NULL)
 3817                 ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
 3818 }
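
/*
 * A minimal usage sketch (illustrative): lock two unrelated vnodes without
 * knowing a safe lock order, e.g. before moving data between them.  Passing
 * false for both *_locked arguments states that neither vnode is locked on
 * entry; both are exclusively locked on return.
 */
#if 0
	vn_lock_pair(invp, false, outvp, false);
	/* ... operate on both vnodes ... */
	VOP_UNLOCK(invp);
	VOP_UNLOCK(outvp);
#endif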
 3819 
 3820 int
 3821 vn_lktype_write(struct mount *mp, struct vnode *vp)
 3822 {
 3823         if (MNT_SHARED_WRITES(mp) ||
 3824             (mp == NULL && MNT_SHARED_WRITES(vp->v_mount)))
 3825                 return (LK_SHARED);
 3826         return (LK_EXCLUSIVE);
 3827 }
