vfs_vnops.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
   11  * Copyright (c) 2013, 2014 The FreeBSD Foundation
   12  *
   13  * Portions of this software were developed by Konstantin Belousov
   14  * under sponsorship from the FreeBSD Foundation.
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  * 1. Redistributions of source code must retain the above copyright
   20  *    notice, this list of conditions and the following disclaimer.
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  * 4. Neither the name of the University nor the names of its contributors
   25  *    may be used to endorse or promote products derived from this software
   26  *    without specific prior written permission.
   27  *
   28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   38  * SUCH DAMAGE.
   39  *
   40  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
   41  */
   42 
   43 #include <sys/cdefs.h>
   44 __FBSDID("$FreeBSD: releng/11.2/sys/kern/vfs_vnops.c 338606 2018-09-12 05:07:35Z gordon $");
   45 
   46 #include "opt_hwpmc_hooks.h"
   47 
   48 #include <sys/param.h>
   49 #include <sys/systm.h>
   50 #include <sys/disk.h>
   51 #include <sys/fail.h>
   52 #include <sys/fcntl.h>
   53 #include <sys/file.h>
   54 #include <sys/kdb.h>
   55 #include <sys/stat.h>
   56 #include <sys/priv.h>
   57 #include <sys/proc.h>
   58 #include <sys/limits.h>
   59 #include <sys/lock.h>
   60 #include <sys/mman.h>
   61 #include <sys/mount.h>
   62 #include <sys/mutex.h>
   63 #include <sys/namei.h>
   64 #include <sys/vnode.h>
   65 #include <sys/bio.h>
   66 #include <sys/buf.h>
   67 #include <sys/filio.h>
   68 #include <sys/resourcevar.h>
   69 #include <sys/rwlock.h>
   70 #include <sys/sx.h>
   71 #include <sys/sysctl.h>
   72 #include <sys/ttycom.h>
   73 #include <sys/conf.h>
   74 #include <sys/syslog.h>
   75 #include <sys/unistd.h>
   76 #include <sys/user.h>
   77 
   78 #include <security/audit/audit.h>
   79 #include <security/mac/mac_framework.h>
   80 
   81 #include <vm/vm.h>
   82 #include <vm/vm_extern.h>
   83 #include <vm/pmap.h>
   84 #include <vm/vm_map.h>
   85 #include <vm/vm_object.h>
   86 #include <vm/vm_page.h>
   87 #include <vm/vnode_pager.h>
   88 
   89 #ifdef HWPMC_HOOKS
   90 #include <sys/pmckern.h>
   91 #endif
   92 
   93 static fo_rdwr_t        vn_read;
   94 static fo_rdwr_t        vn_write;
   95 static fo_rdwr_t        vn_io_fault;
   96 static fo_truncate_t    vn_truncate;
   97 static fo_ioctl_t       vn_ioctl;
   98 static fo_poll_t        vn_poll;
   99 static fo_kqfilter_t    vn_kqfilter;
  100 static fo_stat_t        vn_statfile;
  101 static fo_close_t       vn_closefile;
  102 static fo_mmap_t        vn_mmap;
  103 
  104 struct  fileops vnops = {
  105         .fo_read = vn_io_fault,
  106         .fo_write = vn_io_fault,
  107         .fo_truncate = vn_truncate,
  108         .fo_ioctl = vn_ioctl,
  109         .fo_poll = vn_poll,
  110         .fo_kqfilter = vn_kqfilter,
  111         .fo_stat = vn_statfile,
  112         .fo_close = vn_closefile,
  113         .fo_chmod = vn_chmod,
  114         .fo_chown = vn_chown,
  115         .fo_sendfile = vn_sendfile,
  116         .fo_seek = vn_seek,
  117         .fo_fill_kinfo = vn_fill_kinfo,
  118         .fo_mmap = vn_mmap,
  119         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
  120 };
  121 
  122 static const int io_hold_cnt = 16;
  123 static int vn_io_fault_enable = 1;
  124 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RW,
  125     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
  126 static int vn_io_fault_prefault = 0;
  127 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RW,
  128     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
  129 static u_long vn_io_faults_cnt;
  130 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
  131     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
  132 
  133 /*
  134  * Returns true if vn_io_fault mode of handling the i/o request should
  135  * be used.
  136  */
  137 static bool
  138 do_vn_io_fault(struct vnode *vp, struct uio *uio)
  139 {
  140         struct mount *mp;
  141 
  142         return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
  143             (mp = vp->v_mount) != NULL &&
  144             (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
  145 }
  146 
  147 /*
  148  * Structure used to pass arguments to vn_io_fault1(), to do either
  149  * file- or vnode-based I/O calls.
  150  */
  151 struct vn_io_fault_args {
  152         enum {
  153                 VN_IO_FAULT_FOP,
  154                 VN_IO_FAULT_VOP
  155         } kind;
  156         struct ucred *cred;
  157         int flags;
  158         union {
  159                 struct fop_args_tag {
  160                         struct file *fp;
  161                         fo_rdwr_t *doio;
  162                 } fop_args;
  163                 struct vop_args_tag {
  164                         struct vnode *vp;
  165                 } vop_args;
  166         } args;
  167 };
  168 
  169 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
  170     struct vn_io_fault_args *args, struct thread *td);
  171 
  172 int
  173 vn_open(ndp, flagp, cmode, fp)
  174         struct nameidata *ndp;
  175         int *flagp, cmode;
  176         struct file *fp;
  177 {
  178         struct thread *td = ndp->ni_cnd.cn_thread;
  179 
  180         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
  181 }
  182 
  183 /*
  184  * Common code for vnode open operations via a name lookup.
  185  * Lookup the vnode and invoke VOP_CREATE if needed.
  186  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  187  * 
  188  * Note that this does NOT free nameidata for the successful case,
  189  * due to the NDINIT being done elsewhere.
  190  */
  191 int
  192 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
  193     struct ucred *cred, struct file *fp)
  194 {
  195         struct vnode *vp;
  196         struct mount *mp;
  197         struct thread *td = ndp->ni_cnd.cn_thread;
  198         struct vattr vat;
  199         struct vattr *vap = &vat;
  200         int fmode, error;
  201 
  202 restart:
  203         fmode = *flagp;
  204         if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
  205             O_EXCL | O_DIRECTORY))
  206                 return (EINVAL);
  207         else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
  208                 ndp->ni_cnd.cn_nameiop = CREATE;
  209                 /*
  210                  * Set NOCACHE to avoid flushing the cache when
  211                  * rolling in many files at once.
  212                 */
  213                 ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | NOCACHE;
  214                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
  215                         ndp->ni_cnd.cn_flags |= FOLLOW;
  216                 if (!(vn_open_flags & VN_OPEN_NOAUDIT))
  217                         ndp->ni_cnd.cn_flags |= AUDITVNODE1;
  218                 if (vn_open_flags & VN_OPEN_NOCAPCHECK)
  219                         ndp->ni_cnd.cn_flags |= NOCAPCHECK;
  220                 bwillwrite();
  221                 if ((error = namei(ndp)) != 0)
  222                         return (error);
  223                 if (ndp->ni_vp == NULL) {
  224                         VATTR_NULL(vap);
  225                         vap->va_type = VREG;
  226                         vap->va_mode = cmode;
  227                         if (fmode & O_EXCL)
  228                                 vap->va_vaflags |= VA_EXCLUSIVE;
  229                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
  230                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  231                                 vput(ndp->ni_dvp);
  232                                 if ((error = vn_start_write(NULL, &mp,
  233                                     V_XSLEEP | PCATCH)) != 0)
  234                                         return (error);
  235                                 goto restart;
  236                         }
  237                         if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
  238                                 ndp->ni_cnd.cn_flags |= MAKEENTRY;
  239 #ifdef MAC
  240                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
  241                             &ndp->ni_cnd, vap);
  242                         if (error == 0)
  243 #endif
  244                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
  245                                                    &ndp->ni_cnd, vap);
  246                         vput(ndp->ni_dvp);
  247                         vn_finished_write(mp);
  248                         if (error) {
  249                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  250                                 return (error);
  251                         }
  252                         fmode &= ~O_TRUNC;
  253                         vp = ndp->ni_vp;
  254                 } else {
  255                         if (ndp->ni_dvp == ndp->ni_vp)
  256                                 vrele(ndp->ni_dvp);
  257                         else
  258                                 vput(ndp->ni_dvp);
  259                         ndp->ni_dvp = NULL;
  260                         vp = ndp->ni_vp;
  261                         if (fmode & O_EXCL) {
  262                                 error = EEXIST;
  263                                 goto bad;
  264                         }
  265                         fmode &= ~O_CREAT;
  266                 }
  267         } else {
  268                 ndp->ni_cnd.cn_nameiop = LOOKUP;
  269                 ndp->ni_cnd.cn_flags = ISOPEN |
  270                     ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
  271                 if (!(fmode & FWRITE))
  272                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
  273                 if (!(vn_open_flags & VN_OPEN_NOAUDIT))
  274                         ndp->ni_cnd.cn_flags |= AUDITVNODE1;
  275                 if (vn_open_flags & VN_OPEN_NOCAPCHECK)
  276                         ndp->ni_cnd.cn_flags |= NOCAPCHECK;
  277                 if ((error = namei(ndp)) != 0)
  278                         return (error);
  279                 vp = ndp->ni_vp;
  280         }
  281         error = vn_open_vnode(vp, fmode, cred, td, fp);
  282         if (error)
  283                 goto bad;
  284         *flagp = fmode;
  285         return (0);
  286 bad:
  287         NDFREE(ndp, NDF_ONLY_PNBUF);
  288         vput(vp);
  289         *flagp = fmode;
  290         ndp->ni_vp = NULL;
  291         return (error);
  292 }
  293 
  294 /*
  295  * Common code for vnode open operations once a vnode is located.
  296  * Check permissions, and call the VOP_OPEN routine.
  297  */
  298 int
  299 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
  300     struct thread *td, struct file *fp)
  301 {
  302         accmode_t accmode;
  303         struct flock lf;
  304         int error, lock_flags, type;
  305 
  306         if (vp->v_type == VLNK)
  307                 return (EMLINK);
  308         if (vp->v_type == VSOCK)
  309                 return (EOPNOTSUPP);
  310         if (vp->v_type != VDIR && fmode & O_DIRECTORY)
  311                 return (ENOTDIR);
  312         accmode = 0;
  313         if (fmode & (FWRITE | O_TRUNC)) {
  314                 if (vp->v_type == VDIR)
  315                         return (EISDIR);
  316                 accmode |= VWRITE;
  317         }
  318         if (fmode & FREAD)
  319                 accmode |= VREAD;
  320         if (fmode & FEXEC)
  321                 accmode |= VEXEC;
  322         if ((fmode & O_APPEND) && (fmode & FWRITE))
  323                 accmode |= VAPPEND;
  324 #ifdef MAC
  325         if (fmode & O_CREAT)
  326                 accmode |= VCREAT;
  327         if (fmode & O_VERIFY)
  328                 accmode |= VVERIFY;
  329         error = mac_vnode_check_open(cred, vp, accmode);
  330         if (error)
  331                 return (error);
  332 
  333         accmode &= ~(VCREAT | VVERIFY);
  334 #endif
  335         if ((fmode & O_CREAT) == 0) {
  336                 if (accmode & VWRITE) {
  337                         error = vn_writechk(vp);
  338                         if (error)
  339                                 return (error);
  340                 }
  341                 if (accmode) {
  342                         error = VOP_ACCESS(vp, accmode, cred, td);
  343                         if (error)
  344                                 return (error);
  345                 }
  346         }
  347         if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
  348                 vn_lock(vp, LK_UPGRADE | LK_RETRY);
  349         if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
  350                 return (error);
  351 
  352         while ((fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
  353                 KASSERT(fp != NULL, ("open with flock requires fp"));
  354                 if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE) {
  355                         error = EOPNOTSUPP;
  356                         break;
  357                 }
  358                 lock_flags = VOP_ISLOCKED(vp);
  359                 VOP_UNLOCK(vp, 0);
  360                 lf.l_whence = SEEK_SET;
  361                 lf.l_start = 0;
  362                 lf.l_len = 0;
  363                 if (fmode & O_EXLOCK)
  364                         lf.l_type = F_WRLCK;
  365                 else
  366                         lf.l_type = F_RDLCK;
  367                 type = F_FLOCK;
  368                 if ((fmode & FNONBLOCK) == 0)
  369                         type |= F_WAIT;
  370                 error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
  371                 if (error == 0)
  372                         fp->f_flag |= FHASLOCK;
  373                 vn_lock(vp, lock_flags | LK_RETRY);
  374                 if (error != 0)
  375                         break;
  376                 if ((vp->v_iflag & VI_DOOMED) != 0) {
  377                         error = ENOENT;
  378                         break;
  379                 }
  380 
  381                 /*
  382                  * Another thread might have used this vnode as an
  383                  * executable while the vnode lock was dropped.
  384                  * Ensure the vnode is still able to be opened for
  385                  * writing after the lock has been obtained.
  386                  */
  387                 if ((accmode & VWRITE) != 0)
  388                         error = vn_writechk(vp);
  389                 break;
  390         }
  391 
  392         if (error != 0) {
  393                 fp->f_flag |= FOPENFAILED;
  394                 fp->f_vnode = vp;
  395                 if (fp->f_ops == &badfileops) {
  396                         fp->f_type = DTYPE_VNODE;
  397                         fp->f_ops = &vnops;
  398                 }
  399                 vref(vp);
  400         } else if  ((fmode & FWRITE) != 0) {
  401                 VOP_ADD_WRITECOUNT(vp, 1);
  402                 CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
  403                     __func__, vp, vp->v_writecount);
  404         }
  405         ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
  406         return (error);
  407 }
  408 
  409 /*
  410  * Check for write permissions on the specified vnode.
  411  * Prototype text segments cannot be written.
  412  */
  413 int
  414 vn_writechk(struct vnode *vp)
  415 {
  416 
  417         ASSERT_VOP_LOCKED(vp, "vn_writechk");
  418         /*
  419          * If there's shared text associated with
  420          * the vnode, try to free it up once.  If
  421          * we fail, we can't allow writing.
  422          */
  423         if (VOP_IS_TEXT(vp))
  424                 return (ETXTBSY);
  425 
  426         return (0);
  427 }
  428 
  429 /*
  430  * Vnode close call
  431  */
  432 static int
  433 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
  434     struct thread *td, bool keep_ref)
  435 {
  436         struct mount *mp;
  437         int error, lock_flags;
  438 
  439         if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
  440             MNT_EXTENDED_SHARED(vp->v_mount))
  441                 lock_flags = LK_SHARED;
  442         else
  443                 lock_flags = LK_EXCLUSIVE;
  444 
  445         vn_start_write(vp, &mp, V_WAIT);
  446         vn_lock(vp, lock_flags | LK_RETRY);
  447         AUDIT_ARG_VNODE1(vp);
  448         if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
  449                 VNASSERT(vp->v_writecount > 0, vp, 
  450                     ("vn_close: negative writecount"));
  451                 VOP_ADD_WRITECOUNT(vp, -1);
  452                 CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
  453                     __func__, vp, vp->v_writecount);
  454         }
  455         error = VOP_CLOSE(vp, flags, file_cred, td);
  456         if (keep_ref)
  457                 VOP_UNLOCK(vp, 0);
  458         else
  459                 vput(vp);
  460         vn_finished_write(mp);
  461         return (error);
  462 }
  463 
  464 int
  465 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
  466     struct thread *td)
  467 {
  468 
  469         return (vn_close1(vp, flags, file_cred, td, false));
  470 }
  471 
  472 /*
  473  * Heuristic to detect sequential operation.
  474  */
  475 static int
  476 sequential_heuristic(struct uio *uio, struct file *fp)
  477 {
  478 
  479         ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
  480         if (fp->f_flag & FRDAHEAD)
  481                 return (fp->f_seqcount << IO_SEQSHIFT);
  482 
  483         /*
  484          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
  485          * that the first I/O is normally considered to be slightly
  486          * sequential.  Seeking to offset 0 doesn't change sequentiality
  487          * unless previous seeks have reduced f_seqcount to 0, in which
  488          * case offset 0 is not special.
  489          */
  490         if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
  491             uio->uio_offset == fp->f_nextoff) {
  492                 /*
  493                  * f_seqcount is in units of fixed-size blocks so that it
  494                  * depends mainly on the amount of sequential I/O and not
  495                  * much on the number of sequential I/O's.  The fixed size
  496                  * of 16384 is hard-coded here since it is (not quite) just
  497                  * a magic size that works well here.  This size is more
  498                  * closely related to the best I/O size for real disks than
  499                  * to any block size used by software.
  500                  */
  501                 fp->f_seqcount += howmany(uio->uio_resid, 16384);
  502                 if (fp->f_seqcount > IO_SEQMAX)
  503                         fp->f_seqcount = IO_SEQMAX;
  504                 return (fp->f_seqcount << IO_SEQSHIFT);
  505         }
  506 
  507         /* Not sequential.  Quickly draw-down sequentiality. */
  508         if (fp->f_seqcount > 1)
  509                 fp->f_seqcount = 1;
  510         else
  511                 fp->f_seqcount = 0;
  512         return (0);
  513 }
  514 
  515 /*
  516  * Package up an I/O request on a vnode into a uio and do it.
  517  */
  518 int
  519 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
  520     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
  521     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
  522 {
  523         struct uio auio;
  524         struct iovec aiov;
  525         struct mount *mp;
  526         struct ucred *cred;
  527         void *rl_cookie;
  528         struct vn_io_fault_args args;
  529         int error, lock_flags;
  530 
  531         if (offset < 0 && vp->v_type != VCHR)
  532                 return (EINVAL);
  533         auio.uio_iov = &aiov;
  534         auio.uio_iovcnt = 1;
  535         aiov.iov_base = base;
  536         aiov.iov_len = len;
  537         auio.uio_resid = len;
  538         auio.uio_offset = offset;
  539         auio.uio_segflg = segflg;
  540         auio.uio_rw = rw;
  541         auio.uio_td = td;
  542         error = 0;
  543 
  544         if ((ioflg & IO_NODELOCKED) == 0) {
  545                 if ((ioflg & IO_RANGELOCKED) == 0) {
  546                         if (rw == UIO_READ) {
  547                                 rl_cookie = vn_rangelock_rlock(vp, offset,
  548                                     offset + len);
  549                         } else {
  550                                 rl_cookie = vn_rangelock_wlock(vp, offset,
  551                                     offset + len);
  552                         }
  553                 } else
  554                         rl_cookie = NULL;
  555                 mp = NULL;
  556                 if (rw == UIO_WRITE) { 
  557                         if (vp->v_type != VCHR &&
  558                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
  559                             != 0)
  560                                 goto out;
  561                         if (MNT_SHARED_WRITES(mp) ||
  562                             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
  563                                 lock_flags = LK_SHARED;
  564                         else
  565                                 lock_flags = LK_EXCLUSIVE;
  566                 } else
  567                         lock_flags = LK_SHARED;
  568                 vn_lock(vp, lock_flags | LK_RETRY);
  569         } else
  570                 rl_cookie = NULL;
  571 
  572         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
  573 #ifdef MAC
  574         if ((ioflg & IO_NOMACCHECK) == 0) {
  575                 if (rw == UIO_READ)
  576                         error = mac_vnode_check_read(active_cred, file_cred,
  577                             vp);
  578                 else
  579                         error = mac_vnode_check_write(active_cred, file_cred,
  580                             vp);
  581         }
  582 #endif
  583         if (error == 0) {
  584                 if (file_cred != NULL)
  585                         cred = file_cred;
  586                 else
  587                         cred = active_cred;
  588                 if (do_vn_io_fault(vp, &auio)) {
  589                         args.kind = VN_IO_FAULT_VOP;
  590                         args.cred = cred;
  591                         args.flags = ioflg;
  592                         args.args.vop_args.vp = vp;
  593                         error = vn_io_fault1(vp, &auio, &args, td);
  594                 } else if (rw == UIO_READ) {
  595                         error = VOP_READ(vp, &auio, ioflg, cred);
  596                 } else /* if (rw == UIO_WRITE) */ {
  597                         error = VOP_WRITE(vp, &auio, ioflg, cred);
  598                 }
  599         }
  600         if (aresid)
  601                 *aresid = auio.uio_resid;
  602         else
  603                 if (auio.uio_resid && error == 0)
  604                         error = EIO;
  605         if ((ioflg & IO_NODELOCKED) == 0) {
  606                 VOP_UNLOCK(vp, 0);
  607                 if (mp != NULL)
  608                         vn_finished_write(mp);
  609         }
  610  out:
  611         if (rl_cookie != NULL)
  612                 vn_rangelock_unlock(vp, rl_cookie);
  613         return (error);
  614 }
  615 
  616 /*
  617  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  618  * request is split up into smaller chunks and we try to avoid saturating
  619  * the buffer cache while potentially holding a vnode locked, so we 
  620  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
  621  * to give other processes a chance to lock the vnode (either other processes
  622  * core'ing the same binary, or unrelated processes scanning the directory).
  623  */
  624 int
  625 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
  626     file_cred, aresid, td)
  627         enum uio_rw rw;
  628         struct vnode *vp;
  629         void *base;
  630         size_t len;
  631         off_t offset;
  632         enum uio_seg segflg;
  633         int ioflg;
  634         struct ucred *active_cred;
  635         struct ucred *file_cred;
  636         size_t *aresid;
  637         struct thread *td;
  638 {
  639         int error = 0;
  640         ssize_t iaresid;
  641 
  642         do {
  643                 int chunk;
  644 
  645                 /*
  646                  * Force `offset' to a multiple of MAXBSIZE except possibly
  647                  * for the first chunk, so that filesystems only need to
  648                  * write full blocks except possibly for the first and last
  649                  * chunks.
  650                  */
  651                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
  652 
  653                 if (chunk > len)
  654                         chunk = len;
  655                 if (rw != UIO_READ && vp->v_type == VREG)
  656                         bwillwrite();
  657                 iaresid = 0;
  658                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
  659                     ioflg, active_cred, file_cred, &iaresid, td);
  660                 len -= chunk;   /* aresid calc already includes length */
  661                 if (error)
  662                         break;
  663                 offset += chunk;
  664                 base = (char *)base + chunk;
  665                 kern_yield(PRI_USER);
  666         } while (len);
  667         if (aresid)
  668                 *aresid = len + iaresid;
  669         return (error);
  670 }
  671 
  672 off_t
  673 foffset_lock(struct file *fp, int flags)
  674 {
  675         struct mtx *mtxp;
  676         off_t res;
  677 
  678         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
  679 
  680 #if OFF_MAX <= LONG_MAX
  681         /*
  682          * Caller only wants the current f_offset value.  Assume that
  683          * the long and shorter integer types reads are atomic.
  684          */
  685         if ((flags & FOF_NOLOCK) != 0)
  686                 return (fp->f_offset);
  687 #endif
  688 
  689         /*
  690          * According to McKusick the vn lock was protecting f_offset here.
  691          * It is now protected by the FOFFSET_LOCKED flag.
  692          */
  693         mtxp = mtx_pool_find(mtxpool_sleep, fp);
  694         mtx_lock(mtxp);
  695         if ((flags & FOF_NOLOCK) == 0) {
  696                 while (fp->f_vnread_flags & FOFFSET_LOCKED) {
  697                         fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
  698                         msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
  699                             "vofflock", 0);
  700                 }
  701                 fp->f_vnread_flags |= FOFFSET_LOCKED;
  702         }
  703         res = fp->f_offset;
  704         mtx_unlock(mtxp);
  705         return (res);
  706 }
  707 
  708 void
  709 foffset_unlock(struct file *fp, off_t val, int flags)
  710 {
  711         struct mtx *mtxp;
  712 
  713         KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
  714 
  715 #if OFF_MAX <= LONG_MAX
  716         if ((flags & FOF_NOLOCK) != 0) {
  717                 if ((flags & FOF_NOUPDATE) == 0)
  718                         fp->f_offset = val;
  719                 if ((flags & FOF_NEXTOFF) != 0)
  720                         fp->f_nextoff = val;
  721                 return;
  722         }
  723 #endif
  724 
  725         mtxp = mtx_pool_find(mtxpool_sleep, fp);
  726         mtx_lock(mtxp);
  727         if ((flags & FOF_NOUPDATE) == 0)
  728                 fp->f_offset = val;
  729         if ((flags & FOF_NEXTOFF) != 0)
  730                 fp->f_nextoff = val;
  731         if ((flags & FOF_NOLOCK) == 0) {
  732                 KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
  733                     ("Lost FOFFSET_LOCKED"));
  734                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
  735                         wakeup(&fp->f_vnread_flags);
  736                 fp->f_vnread_flags = 0;
  737         }
  738         mtx_unlock(mtxp);
  739 }
  740 
  741 void
  742 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
  743 {
  744 
  745         if ((flags & FOF_OFFSET) == 0)
  746                 uio->uio_offset = foffset_lock(fp, flags);
  747 }
  748 
  749 void
  750 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
  751 {
  752 
  753         if ((flags & FOF_OFFSET) == 0)
  754                 foffset_unlock(fp, uio->uio_offset, flags);
  755 }
  756 
  757 static int
  758 get_advice(struct file *fp, struct uio *uio)
  759 {
  760         struct mtx *mtxp;
  761         int ret;
  762 
  763         ret = POSIX_FADV_NORMAL;
  764         if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
  765                 return (ret);
  766 
  767         mtxp = mtx_pool_find(mtxpool_sleep, fp);
  768         mtx_lock(mtxp);
  769         if (fp->f_advice != NULL &&
  770             uio->uio_offset >= fp->f_advice->fa_start &&
  771             uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
  772                 ret = fp->f_advice->fa_advice;
  773         mtx_unlock(mtxp);
  774         return (ret);
  775 }
  776 
  777 /*
  778  * File table vnode read routine.
  779  */
  780 static int
  781 vn_read(fp, uio, active_cred, flags, td)
  782         struct file *fp;
  783         struct uio *uio;
  784         struct ucred *active_cred;
  785         int flags;
  786         struct thread *td;
  787 {
  788         struct vnode *vp;
  789         off_t orig_offset;
  790         int error, ioflag;
  791         int advice;
  792 
  793         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
  794             uio->uio_td, td));
  795         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
  796         vp = fp->f_vnode;
  797         ioflag = 0;
  798         if (fp->f_flag & FNONBLOCK)
  799                 ioflag |= IO_NDELAY;
  800         if (fp->f_flag & O_DIRECT)
  801                 ioflag |= IO_DIRECT;
  802         advice = get_advice(fp, uio);
  803         vn_lock(vp, LK_SHARED | LK_RETRY);
  804 
  805         switch (advice) {
  806         case POSIX_FADV_NORMAL:
  807         case POSIX_FADV_SEQUENTIAL:
  808         case POSIX_FADV_NOREUSE:
  809                 ioflag |= sequential_heuristic(uio, fp);
  810                 break;
  811         case POSIX_FADV_RANDOM:
  812                 /* Disable read-ahead for random I/O. */
  813                 break;
  814         }
  815         orig_offset = uio->uio_offset;
  816 
  817 #ifdef MAC
  818         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
  819         if (error == 0)
  820 #endif
  821                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
  822         fp->f_nextoff = uio->uio_offset;
  823         VOP_UNLOCK(vp, 0);
  824         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
  825             orig_offset != uio->uio_offset)
  826                 /*
  827                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
  828                  * for the backing file after a POSIX_FADV_NOREUSE
  829                  * read(2).
  830                  */
  831                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
  832                     POSIX_FADV_DONTNEED);
  833         return (error);
  834 }
  835 
  836 /*
  837  * File table vnode write routine.
  838  */
  839 static int
  840 vn_write(fp, uio, active_cred, flags, td)
  841         struct file *fp;
  842         struct uio *uio;
  843         struct ucred *active_cred;
  844         int flags;
  845         struct thread *td;
  846 {
  847         struct vnode *vp;
  848         struct mount *mp;
  849         off_t orig_offset;
  850         int error, ioflag, lock_flags;
  851         int advice;
  852 
  853         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
  854             uio->uio_td, td));
  855         KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
  856         vp = fp->f_vnode;
  857         if (vp->v_type == VREG)
  858                 bwillwrite();
  859         ioflag = IO_UNIT;
  860         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
  861                 ioflag |= IO_APPEND;
  862         if (fp->f_flag & FNONBLOCK)
  863                 ioflag |= IO_NDELAY;
  864         if (fp->f_flag & O_DIRECT)
  865                 ioflag |= IO_DIRECT;
  866         if ((fp->f_flag & O_FSYNC) ||
  867             (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
  868                 ioflag |= IO_SYNC;
  869         mp = NULL;
  870         if (vp->v_type != VCHR &&
  871             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
  872                 goto unlock;
  873 
  874         advice = get_advice(fp, uio);
  875 
  876         if (MNT_SHARED_WRITES(mp) ||
  877             (mp == NULL && MNT_SHARED_WRITES(vp->v_mount))) {
  878                 lock_flags = LK_SHARED;
  879         } else {
  880                 lock_flags = LK_EXCLUSIVE;
  881         }
  882 
  883         vn_lock(vp, lock_flags | LK_RETRY);
  884         switch (advice) {
  885         case POSIX_FADV_NORMAL:
  886         case POSIX_FADV_SEQUENTIAL:
  887         case POSIX_FADV_NOREUSE:
  888                 ioflag |= sequential_heuristic(uio, fp);
  889                 break;
  890         case POSIX_FADV_RANDOM:
  891                 /* XXX: Is this correct? */
  892                 break;
  893         }
  894         orig_offset = uio->uio_offset;
  895 
  896 #ifdef MAC
  897         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
  898         if (error == 0)
  899 #endif
  900                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
  901         fp->f_nextoff = uio->uio_offset;
  902         VOP_UNLOCK(vp, 0);
  903         if (vp->v_type != VCHR)
  904                 vn_finished_write(mp);
  905         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
  906             orig_offset != uio->uio_offset)
  907                 /*
  908                  * Use POSIX_FADV_DONTNEED to flush pages and buffers
  909                  * for the backing file after a POSIX_FADV_NOREUSE
  910                  * write(2).
  911                  */
  912                 error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
  913                     POSIX_FADV_DONTNEED);
  914 unlock:
  915         return (error);
  916 }
  917 
/*
 * vn_io_fault() is a wrapper around vn_read() and vn_write() that
 * prevents the following deadlock:
 *
 * Assume that thread A reads from the vnode vp1 into the userspace
 * buffer buf1, which is backed by the pages of vnode vp2.  If a page in
 * buf1 is currently not resident, then the system ends up with the call
 * chain
 *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
 *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
 * which establishes the lock order vp1->vn_lock, then vp2->vn_lock.
 * If, at the same time, thread B reads from vnode vp2 into the buffer
 * buf2, which is backed by the pages of vnode vp1, and some page in buf2
 * is not resident, we get the reversed order vp2->vn_lock, then
 * vp1->vn_lock.
 *
 * To prevent the lock order reversal and deadlock, vn_io_fault() does
 * not allow page faults to happen during VOP_READ() or VOP_WRITE().
 * Instead, it first tries to do the i/o over the whole range with page
 * faults disabled.  If all pages in the i/o buffer are resident and
 * mapped, the VOP will succeed (ignoring genuine filesystem errors).
 * Otherwise, we get back EFAULT, and vn_io_fault() falls back to doing
 * the i/o in chunks, with all pages in the chunk prefaulted and held
 * using vm_fault_quick_hold_pages().
 *
 * Filesystems using this deadlock avoidance scheme should use the
 * array of the held pages from uio, saved in curthread->td_ma,
 * instead of doing uiomove().  A helper function,
 * vn_io_fault_uiomove(), converts a uiomove request into
 * uiomove_fromphys() over the td_ma array.
 *
 * Since vnode locks no longer cover the whole i/o, rangelocks
 * make the current i/o request atomic with respect to other i/os and
 * truncations.
 */
  951 
  952 /*
  953  * Decode vn_io_fault_args and perform the corresponding i/o.
  954  */
  955 static int
  956 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
  957     struct thread *td)
  958 {
  959         int error, save;
  960 
  961         error = 0;
  962         save = vm_fault_disable_pagefaults();
  963         switch (args->kind) {
  964         case VN_IO_FAULT_FOP:
  965                 error = (args->args.fop_args.doio)(args->args.fop_args.fp,
  966                     uio, args->cred, args->flags, td);
  967                 break;
  968         case VN_IO_FAULT_VOP:
  969                 if (uio->uio_rw == UIO_READ) {
  970                         error = VOP_READ(args->args.vop_args.vp, uio,
  971                             args->flags, args->cred);
  972                 } else if (uio->uio_rw == UIO_WRITE) {
  973                         error = VOP_WRITE(args->args.vop_args.vp, uio,
  974                             args->flags, args->cred);
  975                 }
  976                 break;
  977         default:
  978                 panic("vn_io_fault_doio: unknown kind of io %d %d",
  979                     args->kind, uio->uio_rw);
  980         }
  981         vm_fault_enable_pagefaults(save);
  982         return (error);
  983 }
  984 
  985 static int
  986 vn_io_fault_touch(char *base, const struct uio *uio)
  987 {
  988         int r;
  989 
  990         r = fubyte(base);
  991         if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
  992                 return (EFAULT);
  993         return (0);
  994 }
  995 
  996 static int
  997 vn_io_fault_prefault_user(const struct uio *uio)
  998 {
  999         char *base;
 1000         const struct iovec *iov;
 1001         size_t len;
 1002         ssize_t resid;
 1003         int error, i;
 1004 
 1005         KASSERT(uio->uio_segflg == UIO_USERSPACE,
 1006             ("vn_io_fault_prefault userspace"));
 1007 
 1008         error = i = 0;
 1009         iov = uio->uio_iov;
 1010         resid = uio->uio_resid;
 1011         base = iov->iov_base;
 1012         len = iov->iov_len;
 1013         while (resid > 0) {
 1014                 error = vn_io_fault_touch(base, uio);
 1015                 if (error != 0)
 1016                         break;
 1017                 if (len < PAGE_SIZE) {
 1018                         if (len != 0) {
 1019                                 error = vn_io_fault_touch(base + len - 1, uio);
 1020                                 if (error != 0)
 1021                                         break;
 1022                                 resid -= len;
 1023                         }
 1024                         if (++i >= uio->uio_iovcnt)
 1025                                 break;
 1026                         iov = uio->uio_iov + i;
 1027                         base = iov->iov_base;
 1028                         len = iov->iov_len;
 1029                 } else {
 1030                         len -= PAGE_SIZE;
 1031                         base += PAGE_SIZE;
 1032                         resid -= PAGE_SIZE;
 1033                 }
 1034         }
 1035         return (error);
 1036 }
 1037 
 1038 /*
 1039  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
 1040  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
 1041  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
 1042  * into args and call vn_io_fault1() to handle faults during the user
 1043  * mode buffer accesses.
 1044  */
 1045 static int
 1046 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
 1047     struct thread *td)
 1048 {
 1049         vm_page_t ma[io_hold_cnt + 2];
 1050         struct uio *uio_clone, short_uio;
 1051         struct iovec short_iovec[1];
 1052         vm_page_t *prev_td_ma;
 1053         vm_prot_t prot;
 1054         vm_offset_t addr, end;
 1055         size_t len, resid;
 1056         ssize_t adv;
 1057         int error, cnt, saveheld, prev_td_ma_cnt;
 1058 
 1059         if (vn_io_fault_prefault) {
 1060                 error = vn_io_fault_prefault_user(uio);
 1061                 if (error != 0)
 1062                         return (error); /* Or ignore ? */
 1063         }
 1064 
 1065         prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 1066 
 1067         /*
 1068          * The UFS follows IO_UNIT directive and replays back both
 1069          * uio_offset and uio_resid if an error is encountered during the
 1070          * operation.  But, since the iovec may be already advanced,
 1071          * uio is still in an inconsistent state.
 1072          *
 1073          * Cache a copy of the original uio, which is advanced to the redo
 1074          * point using UIO_NOCOPY below.
 1075          */
 1076         uio_clone = cloneuio(uio);
 1077         resid = uio->uio_resid;
 1078 
 1079         short_uio.uio_segflg = UIO_USERSPACE;
 1080         short_uio.uio_rw = uio->uio_rw;
 1081         short_uio.uio_td = uio->uio_td;
 1082 
 1083         error = vn_io_fault_doio(args, uio, td);
 1084         if (error != EFAULT)
 1085                 goto out;
 1086 
 1087         atomic_add_long(&vn_io_faults_cnt, 1);
 1088         uio_clone->uio_segflg = UIO_NOCOPY;
 1089         uiomove(NULL, resid - uio->uio_resid, uio_clone);
 1090         uio_clone->uio_segflg = uio->uio_segflg;
 1091 
 1092         saveheld = curthread_pflags_set(TDP_UIOHELD);
 1093         prev_td_ma = td->td_ma;
 1094         prev_td_ma_cnt = td->td_ma_cnt;
 1095 
 1096         while (uio_clone->uio_resid != 0) {
 1097                 len = uio_clone->uio_iov->iov_len;
 1098                 if (len == 0) {
 1099                         KASSERT(uio_clone->uio_iovcnt >= 1,
 1100                             ("iovcnt underflow"));
 1101                         uio_clone->uio_iov++;
 1102                         uio_clone->uio_iovcnt--;
 1103                         continue;
 1104                 }
 1105                 if (len > io_hold_cnt * PAGE_SIZE)
 1106                         len = io_hold_cnt * PAGE_SIZE;
 1107                 addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 1108                 end = round_page(addr + len);
 1109                 if (end < addr) {
 1110                         error = EFAULT;
 1111                         break;
 1112                 }
 1113                 cnt = atop(end - trunc_page(addr));
 1114                 /*
 1115                  * A perfectly misaligned address and length could cause
 1116                  * both the start and the end of the chunk to use partial
 1117                  * page.  +2 accounts for such a situation.
 1118                  */
 1119                 cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
 1120                     addr, len, prot, ma, io_hold_cnt + 2);
 1121                 if (cnt == -1) {
 1122                         error = EFAULT;
 1123                         break;
 1124                 }
 1125                 short_uio.uio_iov = &short_iovec[0];
 1126                 short_iovec[0].iov_base = (void *)addr;
 1127                 short_uio.uio_iovcnt = 1;
 1128                 short_uio.uio_resid = short_iovec[0].iov_len = len;
 1129                 short_uio.uio_offset = uio_clone->uio_offset;
 1130                 td->td_ma = ma;
 1131                 td->td_ma_cnt = cnt;
 1132 
 1133                 error = vn_io_fault_doio(args, &short_uio, td);
 1134                 vm_page_unhold_pages(ma, cnt);
 1135                 adv = len - short_uio.uio_resid;
 1136 
 1137                 uio_clone->uio_iov->iov_base =
 1138                     (char *)uio_clone->uio_iov->iov_base + adv;
 1139                 uio_clone->uio_iov->iov_len -= adv;
 1140                 uio_clone->uio_resid -= adv;
 1141                 uio_clone->uio_offset += adv;
 1142 
 1143                 uio->uio_resid -= adv;
 1144                 uio->uio_offset += adv;
 1145 
 1146                 if (error != 0 || adv == 0)
 1147                         break;
 1148         }
 1149         td->td_ma = prev_td_ma;
 1150         td->td_ma_cnt = prev_td_ma_cnt;
 1151         curthread_pflags_restore(saveheld);
 1152 out:
 1153         free(uio_clone, M_IOV);
 1154         return (error);
 1155 }
 1156 
 1157 static int
 1158 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
 1159     int flags, struct thread *td)
 1160 {
 1161         fo_rdwr_t *doio;
 1162         struct vnode *vp;
 1163         void *rl_cookie;
 1164         struct vn_io_fault_args args;
 1165         int error;
 1166 
 1167         doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
 1168         vp = fp->f_vnode;
 1169         foffset_lock_uio(fp, uio, flags);
 1170         if (do_vn_io_fault(vp, uio)) {
 1171                 args.kind = VN_IO_FAULT_FOP;
 1172                 args.args.fop_args.fp = fp;
 1173                 args.args.fop_args.doio = doio;
 1174                 args.cred = active_cred;
 1175                 args.flags = flags | FOF_OFFSET;
 1176                 if (uio->uio_rw == UIO_READ) {
 1177                         rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
 1178                             uio->uio_offset + uio->uio_resid);
 1179                 } else if ((fp->f_flag & O_APPEND) != 0 ||
 1180                     (flags & FOF_OFFSET) == 0) {
 1181                         /* For appenders, punt and lock the whole range. */
 1182                         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 1183                 } else {
 1184                         rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
 1185                             uio->uio_offset + uio->uio_resid);
 1186                 }
 1187                 error = vn_io_fault1(vp, uio, &args, td);
 1188                 vn_rangelock_unlock(vp, rl_cookie);
 1189         } else {
 1190                 error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 1191         }
 1192         foffset_unlock_uio(fp, uio, flags);
 1193         return (error);
 1194 }
 1195 
 1196 /*
 1197  * Helper function to perform the requested uiomove operation using
 1198  * the held pages for io->uio_iov[0].iov_base buffer instead of
 1199  * copyin/copyout.  Access to the pages with uiomove_fromphys()
 1200  * instead of iov_base prevents page faults that could occur due to
 1201  * pmap_collect() invalidating the mapping created by
 1202  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
 1203  * object cleanup revoking the write access from page mappings.
 1204  *
 1205  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
 1206  * instead of plain uiomove().
 1207  */
 1208 int
 1209 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
 1210 {
 1211         struct uio transp_uio;
 1212         struct iovec transp_iov[1];
 1213         struct thread *td;
 1214         size_t adv;
 1215         int error, pgadv;
 1216 
 1217         td = curthread;
 1218         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 1219             uio->uio_segflg != UIO_USERSPACE)
 1220                 return (uiomove(data, xfersize, uio));
 1221 
 1222         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 1223         transp_iov[0].iov_base = data;
 1224         transp_uio.uio_iov = &transp_iov[0];
 1225         transp_uio.uio_iovcnt = 1;
 1226         if (xfersize > uio->uio_resid)
 1227                 xfersize = uio->uio_resid;
 1228         transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
 1229         transp_uio.uio_offset = 0;
 1230         transp_uio.uio_segflg = UIO_SYSSPACE;
 1231         /*
 1232          * Since transp_iov points to data, and td_ma page array
 1233          * corresponds to original uio->uio_iov, we need to invert the
 1234          * direction of the i/o operation as passed to
 1235          * uiomove_fromphys().
 1236          */
 1237         switch (uio->uio_rw) {
 1238         case UIO_WRITE:
 1239                 transp_uio.uio_rw = UIO_READ;
 1240                 break;
 1241         case UIO_READ:
 1242                 transp_uio.uio_rw = UIO_WRITE;
 1243                 break;
 1244         }
 1245         transp_uio.uio_td = uio->uio_td;
 1246         error = uiomove_fromphys(td->td_ma,
 1247             ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
 1248             xfersize, &transp_uio);
 1249         adv = xfersize - transp_uio.uio_resid;
 1250         pgadv =
 1251             (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
 1252             (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
 1253         td->td_ma += pgadv;
 1254         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 1255             pgadv));
 1256         td->td_ma_cnt -= pgadv;
 1257         uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
 1258         uio->uio_iov->iov_len -= adv;
 1259         uio->uio_resid -= adv;
 1260         uio->uio_offset += adv;
 1261         return (error);
 1262 }
 1263 
 1264 int
 1265 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 1266     struct uio *uio)
 1267 {
 1268         struct thread *td;
 1269         vm_offset_t iov_base;
 1270         int cnt, pgadv;
 1271 
 1272         td = curthread;
 1273         if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 1274             uio->uio_segflg != UIO_USERSPACE)
 1275                 return (uiomove_fromphys(ma, offset, xfersize, uio));
 1276 
 1277         KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 1278         cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
 1279         iov_base = (vm_offset_t)uio->uio_iov->iov_base;
 1280         switch (uio->uio_rw) {
 1281         case UIO_WRITE:
 1282                 pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
 1283                     offset, cnt);
 1284                 break;
 1285         case UIO_READ:
 1286                 pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
 1287                     cnt);
 1288                 break;
 1289         }
 1290         pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
 1291         td->td_ma += pgadv;
 1292         KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 1293             pgadv));
 1294         td->td_ma_cnt -= pgadv;
 1295         uio->uio_iov->iov_base = (char *)(iov_base + cnt);
 1296         uio->uio_iov->iov_len -= cnt;
 1297         uio->uio_resid -= cnt;
 1298         uio->uio_offset += cnt;
 1299         return (0);
 1300 }
 1301 
 1302 
 1303 /*
 1304  * File table truncate routine.
 1305  */
 1306 static int
 1307 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
 1308     struct thread *td)
 1309 {
 1310         struct vattr vattr;
 1311         struct mount *mp;
 1312         struct vnode *vp;
 1313         void *rl_cookie;
 1314         int error;
 1315 
 1316         vp = fp->f_vnode;
 1317 
 1318         /*
 1319          * Lock the whole range for truncation.  Otherwise split i/o
 1320          * might happen partly before and partly after the truncation.
 1321          */
 1322         rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 1323         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 1324         if (error)
 1325                 goto out1;
 1326         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1327         if (vp->v_type == VDIR) {
 1328                 error = EISDIR;
 1329                 goto out;
 1330         }
 1331 #ifdef MAC
 1332         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 1333         if (error)
 1334                 goto out;
 1335 #endif
 1336         error = vn_writechk(vp);
 1337         if (error == 0) {
 1338                 VATTR_NULL(&vattr);
 1339                 vattr.va_size = length;
 1340                 if ((fp->f_flag & O_FSYNC) != 0)
 1341                         vattr.va_vaflags |= VA_SYNC;
 1342                 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
 1343         }
 1344 out:
 1345         VOP_UNLOCK(vp, 0);
 1346         vn_finished_write(mp);
 1347 out1:
 1348         vn_rangelock_unlock(vp, rl_cookie);
 1349         return (error);
 1350 }
 1351 
 1352 /*
 1353  * File table vnode stat routine.
 1354  */
 1355 static int
 1356 vn_statfile(fp, sb, active_cred, td)
 1357         struct file *fp;
 1358         struct stat *sb;
 1359         struct ucred *active_cred;
 1360         struct thread *td;
 1361 {
 1362         struct vnode *vp = fp->f_vnode;
 1363         int error;
 1364 
 1365         vn_lock(vp, LK_SHARED | LK_RETRY);
 1366         error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
 1367         VOP_UNLOCK(vp, 0);
 1368 
 1369         return (error);
 1370 }
 1371 
 1372 /*
 1373  * Stat a vnode; implementation for the stat syscall
 1374  */
 1375 int
 1376 vn_stat(struct vnode *vp, struct stat *sb, struct ucred *active_cred,
 1377     struct ucred *file_cred, struct thread *td)
 1378 {
 1379         struct vattr vattr;
 1380         struct vattr *vap;
 1381         int error;
 1382         u_short mode;
 1383 
 1384         AUDIT_ARG_VNODE1(vp);
 1385 #ifdef MAC
 1386         error = mac_vnode_check_stat(active_cred, file_cred, vp);
 1387         if (error)
 1388                 return (error);
 1389 #endif
 1390 
 1391         vap = &vattr;
 1392 
 1393         /*
 1394          * Initialize defaults for new and unusual fields, so that file
 1395          * systems which don't support these fields don't need to know
 1396          * about them.
 1397          */
 1398         vap->va_birthtime.tv_sec = -1;
 1399         vap->va_birthtime.tv_nsec = 0;
 1400         vap->va_fsid = VNOVAL;
 1401         vap->va_rdev = NODEV;
 1402 
 1403         error = VOP_GETATTR(vp, vap, active_cred);
 1404         if (error)
 1405                 return (error);
 1406 
 1407         /*
 1408          * Zero the spare stat fields
 1409          */
 1410         bzero(sb, sizeof *sb);
 1411 
 1412         /*
 1413          * Copy from vattr table
 1414          */
 1415         if (vap->va_fsid != VNOVAL)
 1416                 sb->st_dev = vap->va_fsid;
 1417         else
 1418                 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
 1419         sb->st_ino = vap->va_fileid;
 1420         mode = vap->va_mode;
 1421         switch (vap->va_type) {
 1422         case VREG:
 1423                 mode |= S_IFREG;
 1424                 break;
 1425         case VDIR:
 1426                 mode |= S_IFDIR;
 1427                 break;
 1428         case VBLK:
 1429                 mode |= S_IFBLK;
 1430                 break;
 1431         case VCHR:
 1432                 mode |= S_IFCHR;
 1433                 break;
 1434         case VLNK:
 1435                 mode |= S_IFLNK;
 1436                 break;
 1437         case VSOCK:
 1438                 mode |= S_IFSOCK;
 1439                 break;
 1440         case VFIFO:
 1441                 mode |= S_IFIFO;
 1442                 break;
 1443         default:
 1444                 return (EBADF);
 1445         }
 1446         sb->st_mode = mode;
 1447         sb->st_nlink = vap->va_nlink;
 1448         sb->st_uid = vap->va_uid;
 1449         sb->st_gid = vap->va_gid;
 1450         sb->st_rdev = vap->va_rdev;
 1451         if (vap->va_size > OFF_MAX)
 1452                 return (EOVERFLOW);
 1453         sb->st_size = vap->va_size;
 1454         sb->st_atim = vap->va_atime;
 1455         sb->st_mtim = vap->va_mtime;
 1456         sb->st_ctim = vap->va_ctime;
 1457         sb->st_birthtim = vap->va_birthtime;
 1458 
 1459         /*
 1460          * According to www.opengroup.org, the meaning of st_blksize is 
 1461          *   "a filesystem-specific preferred I/O block size for this 
 1462          *    object.  In some filesystem types, this may vary from file
 1463          *    to file"
 1464          * Use miminum/default of PAGE_SIZE (e.g. for VCHR).
 1465          */
 1466 
 1467         sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
 1468         
 1469         sb->st_flags = vap->va_flags;
 1470         if (priv_check(td, PRIV_VFS_GENERATION))
 1471                 sb->st_gen = 0;
 1472         else
 1473                 sb->st_gen = vap->va_gen;
 1474 
 1475         sb->st_blocks = vap->va_bytes / S_BLKSIZE;
 1476         return (0);
 1477 }
 1478 
 1479 /*
 1480  * File table vnode ioctl routine.
 1481  */
 1482 static int
 1483 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
 1484     struct thread *td)
 1485 {
 1486         struct vattr vattr;
 1487         struct vnode *vp;
 1488         int error;
 1489 
 1490         vp = fp->f_vnode;
 1491         switch (vp->v_type) {
 1492         case VDIR:
 1493         case VREG:
 1494                 switch (com) {
 1495                 case FIONREAD:
 1496                         vn_lock(vp, LK_SHARED | LK_RETRY);
 1497                         error = VOP_GETATTR(vp, &vattr, active_cred);
 1498                         VOP_UNLOCK(vp, 0);
 1499                         if (error == 0)
 1500                                 *(int *)data = vattr.va_size - fp->f_offset;
 1501                         return (error);
 1502                 case FIONBIO:
 1503                 case FIOASYNC:
 1504                         return (0);
 1505                 default:
 1506                         return (VOP_IOCTL(vp, com, data, fp->f_flag,
 1507                             active_cred, td));
 1508                 }
 1509         default:
 1510                 return (ENOTTY);
 1511         }
 1512 }
 1513 
 1514 /*
 1515  * File table vnode poll routine.
 1516  */
 1517 static int
 1518 vn_poll(struct file *fp, int events, struct ucred *active_cred,
 1519     struct thread *td)
 1520 {
 1521         struct vnode *vp;
 1522         int error;
 1523 
 1524         vp = fp->f_vnode;
 1525 #ifdef MAC
 1526         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1527         AUDIT_ARG_VNODE1(vp);
 1528         error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 1529         VOP_UNLOCK(vp, 0);
 1530         if (!error)
 1531 #endif
 1532 
 1533         error = VOP_POLL(vp, events, fp->f_cred, td);
 1534         return (error);
 1535 }
 1536 
 1537 /*
 1538  * Acquire the requested lock and then check for validity.  LK_RETRY
 1539  * permits vn_lock to return doomed vnodes.
 1540  */
 1541 int
 1542 _vn_lock(struct vnode *vp, int flags, char *file, int line)
 1543 {
 1544         int error;
 1545 
 1546         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 1547             ("vn_lock: no locktype"));
 1548         VNASSERT(vp->v_holdcnt != 0, vp, ("vn_lock: zero hold count"));
 1549 retry:
 1550         error = VOP_LOCK1(vp, flags, file, line);
 1551         flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
 1552         KASSERT((flags & LK_RETRY) == 0 || error == 0,
 1553             ("vn_lock: error %d incompatible with flags %#x", error, flags));
 1554 
 1555         if ((flags & LK_RETRY) == 0) {
 1556                 if (error == 0 && (vp->v_iflag & VI_DOOMED) != 0) {
 1557                         VOP_UNLOCK(vp, 0);
 1558                         error = ENOENT;
 1559                 }
 1560         } else if (error != 0)
 1561                 goto retry;
 1562         return (error);
 1563 }
 1564 
 1565 /*
 1566  * File table vnode close routine.
 1567  */
 1568 static int
 1569 vn_closefile(struct file *fp, struct thread *td)
 1570 {
 1571         struct vnode *vp;
 1572         struct flock lf;
 1573         int error;
 1574         bool ref;
 1575 
 1576         vp = fp->f_vnode;
 1577         fp->f_ops = &badfileops;
 1578         ref= (fp->f_flag & FHASLOCK) != 0 && fp->f_type == DTYPE_VNODE;
 1579 
 1580         error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
 1581 
 1582         if (__predict_false(ref)) {
 1583                 lf.l_whence = SEEK_SET;
 1584                 lf.l_start = 0;
 1585                 lf.l_len = 0;
 1586                 lf.l_type = F_UNLCK;
 1587                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 1588                 vrele(vp);
 1589         }
 1590         return (error);
 1591 }
 1592 
 1593 static bool
 1594 vn_suspendable(struct mount *mp)
 1595 {
 1596 
 1597         return (mp->mnt_op->vfs_susp_clean != NULL);
 1598 }
 1599 
 1600 /*
 1601  * Preparing to start a filesystem write operation. If the operation is
 1602  * permitted, then we bump the count of operations in progress and
 1603  * proceed. If a suspend request is in progress, we wait until the
 1604  * suspension is over, and then proceed.
 1605  */
 1606 static int
 1607 vn_start_write_locked(struct mount *mp, int flags)
 1608 {
 1609         int error, mflags;
 1610 
 1611         mtx_assert(MNT_MTX(mp), MA_OWNED);
 1612         error = 0;
 1613 
 1614         /*
 1615          * Check on status of suspension.
 1616          */
 1617         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 1618             mp->mnt_susp_owner != curthread) {
 1619                 mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
 1620                     (flags & PCATCH) : 0) | (PUSER - 1);
 1621                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 1622                         if (flags & V_NOWAIT) {
 1623                                 error = EWOULDBLOCK;
 1624                                 goto unlock;
 1625                         }
 1626                         error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
 1627                             "suspfs", 0);
 1628                         if (error)
 1629                                 goto unlock;
 1630                 }
 1631         }
 1632         if (flags & V_XSLEEP)
 1633                 goto unlock;
 1634         mp->mnt_writeopcount++;
 1635 unlock:
 1636         if (error != 0 || (flags & V_XSLEEP) != 0)
 1637                 MNT_REL(mp);
 1638         MNT_IUNLOCK(mp);
 1639         return (error);
 1640 }
 1641 
 1642 int
 1643 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
 1644 {
 1645         struct mount *mp;
 1646         int error;
 1647 
 1648         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 1649             ("V_MNTREF requires mp"));
 1650 
 1651         error = 0;
 1652         /*
 1653          * If a vnode is provided, get and return the mount point that
 1654          * to which it will write.
 1655          */
 1656         if (vp != NULL) {
 1657                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 1658                         *mpp = NULL;
 1659                         if (error != EOPNOTSUPP)
 1660                                 return (error);
 1661                         return (0);
 1662                 }
 1663         }
 1664         if ((mp = *mpp) == NULL)
 1665                 return (0);
 1666 
 1667         if (!vn_suspendable(mp)) {
 1668                 if (vp != NULL || (flags & V_MNTREF) != 0)
 1669                         vfs_rel(mp);
 1670                 return (0);
 1671         }
 1672 
 1673         /*
 1674          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 1675          * a vfs_ref().
 1676          * As long as a vnode is not provided we need to acquire a
 1677          * refcount for the provided mountpoint too, in order to
 1678          * emulate a vfs_ref().
 1679          */
 1680         MNT_ILOCK(mp);
 1681         if (vp == NULL && (flags & V_MNTREF) == 0)
 1682                 MNT_REF(mp);
 1683 
 1684         return (vn_start_write_locked(mp, flags));
 1685 }
 1686 
 1687 /*
 1688  * Secondary suspension. Used by operations such as vop_inactive
 1689  * routines that are needed by the higher level functions. These
 1690  * are allowed to proceed until all the higher level functions have
 1691  * completed (indicated by mnt_writeopcount dropping to zero). At that
 1692  * time, these operations are halted until the suspension is over.
 1693  */
 1694 int
 1695 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
 1696 {
 1697         struct mount *mp;
 1698         int error;
 1699 
 1700         KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 1701             ("V_MNTREF requires mp"));
 1702 
 1703  retry:
 1704         if (vp != NULL) {
 1705                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 1706                         *mpp = NULL;
 1707                         if (error != EOPNOTSUPP)
 1708                                 return (error);
 1709                         return (0);
 1710                 }
 1711         }
 1712         /*
 1713          * If we are not suspended or have not yet reached suspended
 1714          * mode, then let the operation proceed.
 1715          */
 1716         if ((mp = *mpp) == NULL)
 1717                 return (0);
 1718 
 1719         if (!vn_suspendable(mp)) {
 1720                 if (vp != NULL || (flags & V_MNTREF) != 0)
 1721                         vfs_rel(mp);
 1722                 return (0);
 1723         }
 1724 
 1725         /*
 1726          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 1727          * a vfs_ref().
 1728          * As long as a vnode is not provided we need to acquire a
 1729          * refcount for the provided mountpoint too, in order to
 1730          * emulate a vfs_ref().
 1731          */
 1732         MNT_ILOCK(mp);
 1733         if (vp == NULL && (flags & V_MNTREF) == 0)
 1734                 MNT_REF(mp);
 1735         if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 1736                 mp->mnt_secondary_writes++;
 1737                 mp->mnt_secondary_accwrites++;
 1738                 MNT_IUNLOCK(mp);
 1739                 return (0);
 1740         }
 1741         if (flags & V_NOWAIT) {
 1742                 MNT_REL(mp);
 1743                 MNT_IUNLOCK(mp);
 1744                 return (EWOULDBLOCK);
 1745         }
 1746         /*
 1747          * Wait for the suspension to finish.
 1748          */
 1749         error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
 1750             ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
 1751             "suspfs", 0);
 1752         vfs_rel(mp);
 1753         if (error == 0)
 1754                 goto retry;
 1755         return (error);
 1756 }
 1757 
 1758 /*
 1759  * Filesystem write operation has completed. If we are suspending and this
 1760  * operation is the last one, notify the suspender that the suspension is
 1761  * now in effect.
 1762  */
 1763 void
 1764 vn_finished_write(struct mount *mp)
 1765 {
 1766         if (mp == NULL || !vn_suspendable(mp))
 1767                 return;
 1768         MNT_ILOCK(mp);
 1769         MNT_REL(mp);
 1770         mp->mnt_writeopcount--;
 1771         if (mp->mnt_writeopcount < 0)
 1772                 panic("vn_finished_write: neg cnt");
 1773         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 1774             mp->mnt_writeopcount <= 0)
 1775                 wakeup(&mp->mnt_writeopcount);
 1776         MNT_IUNLOCK(mp);
 1777 }
 1778 
 1779 
 1780 /*
 1781  * Filesystem secondary write operation has completed. If we are
 1782  * suspending and this operation is the last one, notify the suspender
 1783  * that the suspension is now in effect.
 1784  */
 1785 void
 1786 vn_finished_secondary_write(struct mount *mp)
 1787 {
 1788         if (mp == NULL || !vn_suspendable(mp))
 1789                 return;
 1790         MNT_ILOCK(mp);
 1791         MNT_REL(mp);
 1792         mp->mnt_secondary_writes--;
 1793         if (mp->mnt_secondary_writes < 0)
 1794                 panic("vn_finished_secondary_write: neg cnt");
 1795         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 1796             mp->mnt_secondary_writes <= 0)
 1797                 wakeup(&mp->mnt_secondary_writes);
 1798         MNT_IUNLOCK(mp);
 1799 }
 1800 
 1801 
 1802 
 1803 /*
 1804  * Request a filesystem to suspend write operations.
 1805  */
 1806 int
 1807 vfs_write_suspend(struct mount *mp, int flags)
 1808 {
 1809         int error;
 1810 
 1811         MPASS(vn_suspendable(mp));
 1812 
 1813         MNT_ILOCK(mp);
 1814         if (mp->mnt_susp_owner == curthread) {
 1815                 MNT_IUNLOCK(mp);
 1816                 return (EALREADY);
 1817         }
 1818         while (mp->mnt_kern_flag & MNTK_SUSPEND)
 1819                 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 1820 
 1821         /*
 1822          * Unmount holds a write reference on the mount point.  If we
 1823          * own busy reference and drain for writers, we deadlock with
 1824          * the reference draining in the unmount path.  Callers of
 1825          * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
 1826          * vfs_busy() reference is owned and caller is not in the
 1827          * unmount context.
 1828          */
 1829         if ((flags & VS_SKIP_UNMOUNT) != 0 &&
 1830             (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 1831                 MNT_IUNLOCK(mp);
 1832                 return (EBUSY);
 1833         }
 1834 
 1835         mp->mnt_kern_flag |= MNTK_SUSPEND;
 1836         mp->mnt_susp_owner = curthread;
 1837         if (mp->mnt_writeopcount > 0)
 1838                 (void) msleep(&mp->mnt_writeopcount, 
 1839                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 1840         else
 1841                 MNT_IUNLOCK(mp);
 1842         if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
 1843                 vfs_write_resume(mp, 0);
 1844         return (error);
 1845 }
 1846 
 1847 /*
 1848  * Request a filesystem to resume write operations.
 1849  */
 1850 void
 1851 vfs_write_resume(struct mount *mp, int flags)
 1852 {
 1853 
 1854         MPASS(vn_suspendable(mp));
 1855 
 1856         MNT_ILOCK(mp);
 1857         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 1858                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 1859                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 1860                                        MNTK_SUSPENDED);
 1861                 mp->mnt_susp_owner = NULL;
 1862                 wakeup(&mp->mnt_writeopcount);
 1863                 wakeup(&mp->mnt_flag);
 1864                 curthread->td_pflags &= ~TDP_IGNSUSP;
 1865                 if ((flags & VR_START_WRITE) != 0) {
 1866                         MNT_REF(mp);
 1867                         mp->mnt_writeopcount++;
 1868                 }
 1869                 MNT_IUNLOCK(mp);
 1870                 if ((flags & VR_NO_SUSPCLR) == 0)
 1871                         VFS_SUSP_CLEAN(mp);
 1872         } else if ((flags & VR_START_WRITE) != 0) {
 1873                 MNT_REF(mp);
 1874                 vn_start_write_locked(mp, 0);
 1875         } else {
 1876                 MNT_IUNLOCK(mp);
 1877         }
 1878 }
 1879 
/*
 * Helper loop around vfs_write_suspend() for filesystem unmount VFS
 * methods.
 *
 * Repeatedly drops the caller's write reference, suspends the mount
 * and checks, under the mount interlock, whether MNTK_SUSPENDED is
 * set.  When it is not, the write reference is re-acquired and the
 * attempt is repeated.  On success MNTK_SUSPENDED and MNTK_SUSPEND2
 * are cleared (MNTK_SUSPEND is left as is), sleepers on &mp->mnt_flag
 * are woken and TDP_IGNSUSP is set on the current thread.
 *
 * Returns 0 on success.  If vfs_write_suspend() fails, the write
 * reference is re-acquired and its error is returned.
 */
int
vfs_write_suspend_umnt(struct mount *mp)
{
	int error;

	MPASS(vn_suspendable(mp));
	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
	    ("vfs_write_suspend_umnt: recursed"));

	/* dounmount() already called vn_start_write(). */
	for (;;) {
		vn_finished_write(mp);
		error = vfs_write_suspend(mp, 0);
		if (error != 0) {
			/* Restore the write reference held on entry. */
			vn_start_write(NULL, &mp, V_WAIT);
			return (error);
		}
		MNT_ILOCK(mp);
		/* Leaves the loop with the mount interlock still held. */
		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
			break;
		MNT_IUNLOCK(mp);
		vn_start_write(NULL, &mp, V_WAIT);
	}
	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
	wakeup(&mp->mnt_flag);
	MNT_IUNLOCK(mp);
	curthread->td_pflags |= TDP_IGNSUSP;
	return (0);
}
 1913 
 1914 /*
 1915  * Implement kqueues for files by translating it to vnode operation.
 1916  */
 1917 static int
 1918 vn_kqfilter(struct file *fp, struct knote *kn)
 1919 {
 1920 
 1921         return (VOP_KQFILTER(fp->f_vnode, kn));
 1922 }
 1923 
 1924 /*
 1925  * Simplified in-kernel wrapper calls for extended attribute access.
 1926  * Both calls pass in a NULL credential, authorizing as "kernel" access.
 1927  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 1928  */
 1929 int
 1930 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 1931     const char *attrname, int *buflen, char *buf, struct thread *td)
 1932 {
 1933         struct uio      auio;
 1934         struct iovec    iov;
 1935         int     error;
 1936 
 1937         iov.iov_len = *buflen;
 1938         iov.iov_base = buf;
 1939 
 1940         auio.uio_iov = &iov;
 1941         auio.uio_iovcnt = 1;
 1942         auio.uio_rw = UIO_READ;
 1943         auio.uio_segflg = UIO_SYSSPACE;
 1944         auio.uio_td = td;
 1945         auio.uio_offset = 0;
 1946         auio.uio_resid = *buflen;
 1947 
 1948         if ((ioflg & IO_NODELOCKED) == 0)
 1949                 vn_lock(vp, LK_SHARED | LK_RETRY);
 1950 
 1951         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1952 
 1953         /* authorize attribute retrieval as kernel */
 1954         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 1955             td);
 1956 
 1957         if ((ioflg & IO_NODELOCKED) == 0)
 1958                 VOP_UNLOCK(vp, 0);
 1959 
 1960         if (error == 0) {
 1961                 *buflen = *buflen - auio.uio_resid;
 1962         }
 1963 
 1964         return (error);
 1965 }
 1966 
 1967 /*
 1968  * XXX failure mode if partially written?
 1969  */
 1970 int
 1971 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 1972     const char *attrname, int buflen, char *buf, struct thread *td)
 1973 {
 1974         struct uio      auio;
 1975         struct iovec    iov;
 1976         struct mount    *mp;
 1977         int     error;
 1978 
 1979         iov.iov_len = buflen;
 1980         iov.iov_base = buf;
 1981 
 1982         auio.uio_iov = &iov;
 1983         auio.uio_iovcnt = 1;
 1984         auio.uio_rw = UIO_WRITE;
 1985         auio.uio_segflg = UIO_SYSSPACE;
 1986         auio.uio_td = td;
 1987         auio.uio_offset = 0;
 1988         auio.uio_resid = buflen;
 1989 
 1990         if ((ioflg & IO_NODELOCKED) == 0) {
 1991                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 1992                         return (error);
 1993                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1994         }
 1995 
 1996         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1997 
 1998         /* authorize attribute setting as kernel */
 1999         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 2000 
 2001         if ((ioflg & IO_NODELOCKED) == 0) {
 2002                 vn_finished_write(mp);
 2003                 VOP_UNLOCK(vp, 0);
 2004         }
 2005 
 2006         return (error);
 2007 }
 2008 
 2009 int
 2010 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 2011     const char *attrname, struct thread *td)
 2012 {
 2013         struct mount    *mp;
 2014         int     error;
 2015 
 2016         if ((ioflg & IO_NODELOCKED) == 0) {
 2017                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 2018                         return (error);
 2019                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 2020         }
 2021 
 2022         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 2023 
 2024         /* authorize attribute removal as kernel */
 2025         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 2026         if (error == EOPNOTSUPP)
 2027                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 2028                     NULL, td);
 2029 
 2030         if ((ioflg & IO_NODELOCKED) == 0) {
 2031                 vn_finished_write(mp);
 2032                 VOP_UNLOCK(vp, 0);
 2033         }
 2034 
 2035         return (error);
 2036 }
 2037 
 2038 static int
 2039 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
 2040     struct vnode **rvp)
 2041 {
 2042 
 2043         return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
 2044 }
 2045 
 2046 int
 2047 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 2048 {
 2049 
 2050         return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
 2051             lkflags, rvp));
 2052 }
 2053 
 2054 int
 2055 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
 2056     int lkflags, struct vnode **rvp)
 2057 {
 2058         struct mount *mp;
 2059         int ltype, error;
 2060 
 2061         ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
 2062         mp = vp->v_mount;
 2063         ltype = VOP_ISLOCKED(vp);
 2064         KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 2065             ("vn_vget_ino: vp not locked"));
 2066         error = vfs_busy(mp, MBF_NOWAIT);
 2067         if (error != 0) {
 2068                 vfs_ref(mp);
 2069                 VOP_UNLOCK(vp, 0);
 2070                 error = vfs_busy(mp, 0);
 2071                 vn_lock(vp, ltype | LK_RETRY);
 2072                 vfs_rel(mp);
 2073                 if (error != 0)
 2074                         return (ENOENT);
 2075                 if (vp->v_iflag & VI_DOOMED) {
 2076                         vfs_unbusy(mp);
 2077                         return (ENOENT);
 2078                 }
 2079         }
 2080         VOP_UNLOCK(vp, 0);
 2081         error = alloc(mp, alloc_arg, lkflags, rvp);
 2082         vfs_unbusy(mp);
 2083         if (*rvp != vp)
 2084                 vn_lock(vp, ltype | LK_RETRY);
 2085         if (vp->v_iflag & VI_DOOMED) {
 2086                 if (error == 0) {
 2087                         if (*rvp == vp)
 2088                                 vunref(vp);
 2089                         else
 2090                                 vput(*rvp);
 2091                 }
 2092                 error = ENOENT;
 2093         }
 2094         return (error);
 2095 }
 2096 
 2097 int
 2098 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
 2099     struct thread *td)
 2100 {
 2101 
 2102         if (vp->v_type != VREG || td == NULL)
 2103                 return (0);
 2104         if ((uoff_t)uio->uio_offset + uio->uio_resid >
 2105             lim_cur(td, RLIMIT_FSIZE)) {
 2106                 PROC_LOCK(td->td_proc);
 2107                 kern_psignal(td->td_proc, SIGXFSZ);
 2108                 PROC_UNLOCK(td->td_proc);
 2109                 return (EFBIG);
 2110         }
 2111         return (0);
 2112 }
 2113 
 2114 int
 2115 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
 2116     struct thread *td)
 2117 {
 2118         struct vnode *vp;
 2119 
 2120         vp = fp->f_vnode;
 2121 #ifdef AUDIT
 2122         vn_lock(vp, LK_SHARED | LK_RETRY);
 2123         AUDIT_ARG_VNODE1(vp);
 2124         VOP_UNLOCK(vp, 0);
 2125 #endif
 2126         return (setfmode(td, active_cred, vp, mode));
 2127 }
 2128 
 2129 int
 2130 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
 2131     struct thread *td)
 2132 {
 2133         struct vnode *vp;
 2134 
 2135         vp = fp->f_vnode;
 2136 #ifdef AUDIT
 2137         vn_lock(vp, LK_SHARED | LK_RETRY);
 2138         AUDIT_ARG_VNODE1(vp);
 2139         VOP_UNLOCK(vp, 0);
 2140 #endif
 2141         return (setfown(td, active_cred, vp, uid, gid));
 2142 }
 2143 
 2144 void
 2145 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 2146 {
 2147         vm_object_t object;
 2148 
 2149         if ((object = vp->v_object) == NULL)
 2150                 return;
 2151         VM_OBJECT_WLOCK(object);
 2152         vm_object_page_remove(object, start, end, 0);
 2153         VM_OBJECT_WUNLOCK(object);
 2154 }
 2155 
 2156 int
 2157 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 2158 {
 2159         struct vattr va;
 2160         daddr_t bn, bnp;
 2161         uint64_t bsize;
 2162         off_t noff;
 2163         int error;
 2164 
 2165         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 2166             ("Wrong command %lu", cmd));
 2167 
 2168         if (vn_lock(vp, LK_SHARED) != 0)
 2169                 return (EBADF);
 2170         if (vp->v_type != VREG) {
 2171                 error = ENOTTY;
 2172                 goto unlock;
 2173         }
 2174         error = VOP_GETATTR(vp, &va, cred);
 2175         if (error != 0)
 2176                 goto unlock;
 2177         noff = *off;
 2178         if (noff >= va.va_size) {
 2179                 error = ENXIO;
 2180                 goto unlock;
 2181         }
 2182         bsize = vp->v_mount->mnt_stat.f_iosize;
 2183         for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
 2184                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 2185                 if (error == EOPNOTSUPP) {
 2186                         error = ENOTTY;
 2187                         goto unlock;
 2188                 }
 2189                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 2190                     (bnp != -1 && cmd == FIOSEEKDATA)) {
 2191                         noff = bn * bsize;
 2192                         if (noff < *off)
 2193                                 noff = *off;
 2194                         goto unlock;
 2195                 }
 2196         }
 2197         if (noff > va.va_size)
 2198                 noff = va.va_size;
 2199         /* noff == va.va_size. There is an implicit hole at the end of file. */
 2200         if (cmd == FIOSEEKDATA)
 2201                 error = ENXIO;
 2202 unlock:
 2203         VOP_UNLOCK(vp, 0);
 2204         if (error == 0)
 2205                 *off = noff;
 2206         return (error);
 2207 }
 2208 
/*
 * Seek implementation for vnode-backed files.
 *
 * The new position is computed from whence: L_SET uses offset as is,
 * L_INCR adds the current file offset, L_XTND adds the file size
 * obtained from VOP_GETATTR(), and SEEK_DATA / SEEK_HOLE are forwarded
 * as FIOSEEKDATA / FIOSEEKHOLE ioctls.  For every vnode type except
 * VCHR, additions that would exceed OFF_MAX fail with EOVERFLOW and a
 * negative result fails with EINVAL.  An unknown whence is EINVAL.
 *
 * On success the new offset is stored in td->td_uretoff.tdu_off and
 * committed through foffset_unlock(); on error the file offset is left
 * unchanged (FOF_NOUPDATE).
 */
int
vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
{
	struct ucred *cred;
	struct vnode *vp;
	struct vattr vattr;
	off_t foffset, size;
	int error, noneg;

	cred = td->td_ucred;
	vp = fp->f_vnode;
	foffset = foffset_lock(fp, 0);
	/* Range checks below are skipped for character devices. */
	noneg = (vp->v_type != VCHR);
	error = 0;
	switch (whence) {
	case L_INCR:
		if (noneg &&
		    (foffset < 0 ||
		    (offset > 0 && foffset > OFF_MAX - offset))) {
			error = EOVERFLOW;
			break;
		}
		offset += foffset;
		break;
	case L_XTND:
		vn_lock(vp, LK_SHARED | LK_RETRY);
		error = VOP_GETATTR(vp, &vattr, cred);
		VOP_UNLOCK(vp, 0);
		if (error)
			break;

		/*
		 * If the file references a disk device, then fetch
		 * the media size and use that to determine the ending
		 * offset.
		 */
		if (vattr.va_size == 0 && vp->v_type == VCHR &&
		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
			vattr.va_size = size;
		if (noneg &&
		    (vattr.va_size > OFF_MAX ||
		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
			error = EOVERFLOW;
			break;
		}
		offset += vattr.va_size;
		break;
	case L_SET:
		break;
	case SEEK_DATA:
		/* The ioctl updates offset in place. */
		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
		break;
	case SEEK_HOLE:
		/* The ioctl updates offset in place. */
		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
		break;
	default:
		error = EINVAL;
	}
	if (error == 0 && noneg && offset < 0)
		error = EINVAL;
	if (error != 0)
		goto drop;
	VFS_KNOTE_UNLOCKED(vp, 0);
	td->td_uretoff.tdu_off = offset;
drop:
	/* Always pairs with foffset_lock() above, success or failure. */
	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
	return (error);
}
 2277 
 2278 int
 2279 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
 2280     struct thread *td)
 2281 {
 2282         int error;
 2283 
 2284         /*
 2285          * Grant permission if the caller is the owner of the file, or
 2286          * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
 2287          * on the file.  If the time pointer is null, then write
 2288          * permission on the file is also sufficient.
 2289          *
 2290          * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
 2291          * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
 2292          * will be allowed to set the times [..] to the current
 2293          * server time.
 2294          */
 2295         error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
 2296         if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
 2297                 error = VOP_ACCESS(vp, VWRITE, cred, td);
 2298         return (error);
 2299 }
 2300 
 2301 int
 2302 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 2303 {
 2304         struct vnode *vp;
 2305         int error;
 2306 
 2307         if (fp->f_type == DTYPE_FIFO)
 2308                 kif->kf_type = KF_TYPE_FIFO;
 2309         else
 2310                 kif->kf_type = KF_TYPE_VNODE;
 2311         vp = fp->f_vnode;
 2312         vref(vp);
 2313         FILEDESC_SUNLOCK(fdp);
 2314         error = vn_fill_kinfo_vnode(vp, kif);
 2315         vrele(vp);
 2316         FILEDESC_SLOCK(fdp);
 2317         return (error);
 2318 }
 2319 
 2320 static inline void
 2321 vn_fill_junk(struct kinfo_file *kif)
 2322 {
 2323         size_t len, olen;
 2324 
 2325         /*
 2326          * Simulate vn_fullpath returning changing values for a given
 2327          * vp during e.g. coredump.
 2328          */
 2329         len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
 2330         olen = strlen(kif->kf_path);
 2331         if (len < olen)
 2332                 strcpy(&kif->kf_path[len - 1], "$");
 2333         else
 2334                 for (; olen < len; olen++)
 2335                         strcpy(&kif->kf_path[olen], "A");
 2336 }
 2337 
 2338 int
 2339 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
 2340 {
 2341         struct vattr va;
 2342         char *fullpath, *freepath;
 2343         int error;
 2344 
 2345         kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
 2346         freepath = NULL;
 2347         fullpath = "-";
 2348         error = vn_fullpath(curthread, vp, &fullpath, &freepath);
 2349         if (error == 0) {
 2350                 strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 2351         }
 2352         if (freepath != NULL)
 2353                 free(freepath, M_TEMP);
 2354 
 2355         KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
 2356                 vn_fill_junk(kif);
 2357         );
 2358 
 2359         /*
 2360          * Retrieve vnode attributes.
 2361          */
 2362         va.va_fsid = VNOVAL;
 2363         va.va_rdev = NODEV;
 2364         vn_lock(vp, LK_SHARED | LK_RETRY);
 2365         error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 2366         VOP_UNLOCK(vp, 0);
 2367         if (error != 0)
 2368                 return (error);
 2369         if (va.va_fsid != VNOVAL)
 2370                 kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
 2371         else
 2372                 kif->kf_un.kf_file.kf_file_fsid =
 2373                     vp->v_mount->mnt_stat.f_fsid.val[0];
 2374         kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
 2375         kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
 2376         kif->kf_un.kf_file.kf_file_size = va.va_size;
 2377         kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
 2378         return (0);
 2379 }
 2380 
/*
 * mmap implementation for vnode-backed files.
 *
 * Derives the maximum protection from the mount flags (MNT_NOEXEC),
 * the file's open mode (FREAD / FWRITE), the sharing type and
 * cap_maxprot, validates the offset/size range, then obtains the
 * backing object with vm_mmap_vnode() and maps it with
 * vm_mmap_object().
 *
 * Returns EACCES when prot requests execute on a MNT_NOEXEC mount,
 * read without FREAD, or write on a MAP_SHARED mapping without FWRITE;
 * EINVAL for a negative offset or an offset/size overflow; otherwise
 * the error from vm_mmap_vnode() or vm_mmap_object().
 */
int
vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
    struct thread *td)
{
#ifdef HWPMC_HOOKS
	struct pmckern_map_in pkm;
#endif
	struct mount *mp;
	struct vnode *vp;
	vm_object_t object;
	vm_prot_t maxprot;
	boolean_t writecounted;
	int error;

#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
    defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
	/*
	 * POSIX shared-memory objects are defined to have
	 * kernel persistence, and are not defined to support
	 * read(2)/write(2) -- or even open(2).  Thus, we can
	 * use MAP_NOSYNC to trade on-disk coherence for speed.
	 * The shm_open(3) library routine turns on the FPOSIXSHM
	 * flag to request this behavior.
	 */
	if ((fp->f_flag & FPOSIXSHM) != 0)
		flags |= MAP_NOSYNC;
#endif
	vp = fp->f_vnode;

	/*
	 * Ensure that file and memory protections are
	 * compatible.  Note that we only worry about
	 * writability if mapping is shared; in this case,
	 * current and max prot are dictated by the open file.
	 * XXX use the vnode instead?  Problem is: what
	 * credentials do we use for determination? What if
	 * proc does a setuid?
	 */
	mp = vp->v_mount;
	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
		maxprot = VM_PROT_NONE;
		if ((prot & VM_PROT_EXECUTE) != 0)
			return (EACCES);
	} else
		maxprot = VM_PROT_EXECUTE;
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_READ;
	else if ((prot & VM_PROT_READ) != 0)
		return (EACCES);

	/*
	 * If we are sharing potential changes via MAP_SHARED and we
	 * are trying to get write permission although we opened it
	 * without asking for it, bail out.
	 */
	if ((flags & MAP_SHARED) != 0) {
		if ((fp->f_flag & FWRITE) != 0)
			maxprot |= VM_PROT_WRITE;
		else if ((prot & VM_PROT_WRITE) != 0)
			return (EACCES);
	} else {
		/* Non-shared mappings may always be made writable. */
		maxprot |= VM_PROT_WRITE;
		cap_maxprot |= VM_PROT_WRITE;
	}
	maxprot &= cap_maxprot;

	/*
	 * For regular files and shared memory, POSIX requires that
	 * the value of foff be a legitimate offset within the data
	 * object.  In particular, negative offsets are invalid.
	 * Blocking negative offsets and overflows here avoids
	 * possible wraparound or user-level access into reserved
	 * ranges of the data object later.  In contrast, POSIX does
	 * not dictate how offsets are used by device drivers, so in
	 * the case of a device mapping a negative offset is passed
	 * on.
	 */
	if (
#ifdef _LP64
	    size > OFF_MAX ||
#endif
	    foff < 0 || foff > OFF_MAX - size)
		return (EINVAL);

	writecounted = FALSE;
	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
	    &foff, &object, &writecounted);
	if (error != 0)
		return (error);
	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, writecounted, td);
	if (error != 0) {
		/*
		 * If this mapping was accounted for in the vnode's
		 * writecount, then undo that now.
		 */
		if (writecounted)
			vnode_pager_release_writecount(object, 0, size);
		vm_object_deallocate(object);
	}
#ifdef HWPMC_HOOKS
	/* Inform hwpmc(4) if an executable is being mapped. */
	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
		if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
			pkm.pm_file = vp;
			pkm.pm_address = (uintptr_t) *addr;
			PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
		}
	}
#endif
	return (error);
}
Cache object: a025246abad6f013323921b2c9528e4b
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/kern/vfs_vnops.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c