FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c

    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD: releng/8.4/sys/kern/vfs_vnops.c 239788 2012-08-28 18:45:20Z jhb $");
   39 
   40 #include <sys/param.h>
   41 #include <sys/systm.h>
   42 #include <sys/fcntl.h>
   43 #include <sys/file.h>
   44 #include <sys/kdb.h>
   45 #include <sys/stat.h>
   46 #include <sys/priv.h>
   47 #include <sys/proc.h>
   48 #include <sys/limits.h>
   49 #include <sys/lock.h>
   50 #include <sys/mount.h>
   51 #include <sys/mutex.h>
   52 #include <sys/namei.h>
   53 #include <sys/vnode.h>
   54 #include <sys/bio.h>
   55 #include <sys/buf.h>
   56 #include <sys/filio.h>
   57 #include <sys/resourcevar.h>
   58 #include <sys/sx.h>
   59 #include <sys/ttycom.h>
   60 #include <sys/conf.h>
   61 #include <sys/syslog.h>
   62 #include <sys/unistd.h>
   63 
   64 #include <security/mac/mac_framework.h>
   65 
   66 #include <vm/vm.h>
   67 #include <vm/vm_object.h>
   68 
   69 static fo_rdwr_t        vn_read;
   70 static fo_rdwr_t        vn_write;
   71 static fo_truncate_t    vn_truncate;
   72 static fo_ioctl_t       vn_ioctl;
   73 static fo_poll_t        vn_poll;
   74 static fo_kqfilter_t    vn_kqfilter;
   75 static fo_stat_t        vn_statfile;
   76 static fo_close_t       vn_closefile;
   77 
   78 struct  fileops vnops = {
   79         .fo_read = vn_read,
   80         .fo_write = vn_write,
   81         .fo_truncate = vn_truncate,
   82         .fo_ioctl = vn_ioctl,
   83         .fo_poll = vn_poll,
   84         .fo_kqfilter = vn_kqfilter,
   85         .fo_stat = vn_statfile,
   86         .fo_close = vn_closefile,
   87         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
   88 };
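
The vnops table is the glue between the generic file-descriptor layer and
vnode-backed files: the kernel never calls vn_read() or vn_write() directly,
it dispatches through these fo_* pointers.  A stand-alone sketch of the same
dispatch pattern (illustration only; the xfile names below are hypothetical,
not kernel API):

/*
 * Model of fileops-style dispatch: the generic layer sees only the
 * operations vector, never the backing implementation.
 */
#include <stdio.h>

struct xfile;
typedef int xfo_rdwr_t(struct xfile *, char *, int);

struct xfileops {
	xfo_rdwr_t *fo_read;		/* consulted by the generic layer */
	xfo_rdwr_t *fo_write;
};

struct xfile {
	const struct xfileops *f_ops;	/* per-type operations vector */
	int f_offset;
};

static int
xvn_read(struct xfile *fp, char *buf, int len)
{
	(void)buf;
	printf("vnode-backed read: %d bytes at offset %d\n", len, fp->f_offset);
	fp->f_offset += len;
	return (0);
}

static const struct xfileops xvnops = {
	.fo_read = xvn_read,
};

int
main(void)
{
	struct xfile f = { .f_ops = &xvnops, .f_offset = 0 };

	return (f.f_ops->fo_read(&f, NULL, 128));
}
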
   89 
   90 int
   91 vn_open(ndp, flagp, cmode, fp)
   92         struct nameidata *ndp;
   93         int *flagp, cmode;
   94         struct file *fp;
   95 {
   96         struct thread *td = ndp->ni_cnd.cn_thread;
   97 
   98         return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
   99 }
  100 
  101 /*
  102  * Common code for vnode open operations.
  103  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  104  * 
  105  * Note that this does NOT free nameidata for the successful case,
  106  * due to the NDINIT being done elsewhere.
  107  */
  108 int
  109 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
  110     struct ucred *cred, struct file *fp)
  111 {
  112         struct vnode *vp;
  113         struct mount *mp;
  114         struct thread *td = ndp->ni_cnd.cn_thread;
  115         struct vattr vat;
  116         struct vattr *vap = &vat;
  117         int fmode, error;
  118         accmode_t accmode;
  119         int vfslocked, mpsafe;
  120 
  121         mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
  122 restart:
  123         vfslocked = 0;
  124         fmode = *flagp;
  125         if (fmode & O_CREAT) {
  126                 ndp->ni_cnd.cn_nameiop = CREATE;
  127                 ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
  128                     MPSAFE;
  129                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
  130                         ndp->ni_cnd.cn_flags |= FOLLOW;
  131                 if (!(vn_open_flags & VN_OPEN_NOAUDIT))
  132                         ndp->ni_cnd.cn_flags |= AUDITVNODE1;
  133                 bwillwrite();
  134                 if ((error = namei(ndp)) != 0)
  135                         return (error);
  136                 vfslocked = NDHASGIANT(ndp);
  137                 if (!mpsafe)
  138                         ndp->ni_cnd.cn_flags &= ~MPSAFE;
  139                 if (ndp->ni_vp == NULL) {
  140                         VATTR_NULL(vap);
  141                         vap->va_type = VREG;
  142                         vap->va_mode = cmode;
  143                         if (fmode & O_EXCL)
  144                                 vap->va_vaflags |= VA_EXCLUSIVE;
  145                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
  146                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  147                                 vput(ndp->ni_dvp);
  148                                 VFS_UNLOCK_GIANT(vfslocked);
  149                                 if ((error = vn_start_write(NULL, &mp,
  150                                     V_XSLEEP | PCATCH)) != 0)
  151                                         return (error);
  152                                 goto restart;
  153                         }
  154 #ifdef MAC
  155                         error = mac_vnode_check_create(cred, ndp->ni_dvp,
  156                             &ndp->ni_cnd, vap);
  157                         if (error == 0)
  158 #endif
  159                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
  160                                                    &ndp->ni_cnd, vap);
  161                         vput(ndp->ni_dvp);
  162                         vn_finished_write(mp);
  163                         if (error) {
  164                                 VFS_UNLOCK_GIANT(vfslocked);
  165                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  166                                 return (error);
  167                         }
  168                         fmode &= ~O_TRUNC;
  169                         vp = ndp->ni_vp;
  170                 } else {
  171                         if (ndp->ni_dvp == ndp->ni_vp)
  172                                 vrele(ndp->ni_dvp);
  173                         else
  174                                 vput(ndp->ni_dvp);
  175                         ndp->ni_dvp = NULL;
  176                         vp = ndp->ni_vp;
  177                         if (fmode & O_EXCL) {
  178                                 error = EEXIST;
  179                                 goto bad;
  180                         }
  181                         fmode &= ~O_CREAT;
  182                 }
  183         } else {
  184                 ndp->ni_cnd.cn_nameiop = LOOKUP;
  185                 ndp->ni_cnd.cn_flags = ISOPEN |
  186                     ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
  187                     LOCKLEAF | MPSAFE;
  188                 if (!(fmode & FWRITE))
  189                         ndp->ni_cnd.cn_flags |= LOCKSHARED;
  190                 if (!(vn_open_flags & VN_OPEN_NOAUDIT))
  191                         ndp->ni_cnd.cn_flags |= AUDITVNODE1;
  192                 if ((error = namei(ndp)) != 0)
  193                         return (error);
  194                 if (!mpsafe)
  195                         ndp->ni_cnd.cn_flags &= ~MPSAFE;
  196                 vfslocked = NDHASGIANT(ndp);
  197                 vp = ndp->ni_vp;
  198         }
  199         if (vp->v_type == VLNK) {
  200                 error = EMLINK;
  201                 goto bad;
  202         }
  203         if (vp->v_type == VSOCK) {
  204                 error = EOPNOTSUPP;
  205                 goto bad;
  206         }
  207         accmode = 0;
  208         if (fmode & (FWRITE | O_TRUNC)) {
  209                 if (vp->v_type == VDIR) {
  210                         error = EISDIR;
  211                         goto bad;
  212                 }
  213                 accmode |= VWRITE;
  214         }
  215         if (fmode & FREAD)
  216                 accmode |= VREAD;
  217         if (fmode & FEXEC)
  218                 accmode |= VEXEC;
  219         if ((fmode & O_APPEND) && (fmode & FWRITE))
  220                 accmode |= VAPPEND;
  221 #ifdef MAC
  222         error = mac_vnode_check_open(cred, vp, accmode);
  223         if (error)
  224                 goto bad;
  225 #endif
  226         if ((fmode & O_CREAT) == 0) {
  227                 if (accmode & VWRITE) {
  228                         error = vn_writechk(vp);
  229                         if (error)
  230                                 goto bad;
  231                 }
  232                 if (accmode) {
  233                         error = VOP_ACCESS(vp, accmode, cred, td);
  234                         if (error)
  235                                 goto bad;
  236                 }
  237         }
  238         if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
  239                 goto bad;
  240 
  241         if (fmode & FWRITE)
  242                 vp->v_writecount++;
  243         *flagp = fmode;
  244         ASSERT_VOP_LOCKED(vp, "vn_open_cred");
  245         if (!mpsafe)
  246                 VFS_UNLOCK_GIANT(vfslocked);
  247         return (0);
  248 bad:
  249         NDFREE(ndp, NDF_ONLY_PNBUF);
  250         vput(vp);
  251         VFS_UNLOCK_GIANT(vfslocked);
  252         *flagp = fmode;
  253         ndp->ni_vp = NULL;
  254         return (error);
  255 }
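
Both outcomes of the O_CREAT branch above are visible from user space; in
particular, the else-branch turns an existing file plus O_EXCL into EEXIST.
A minimal demonstration:

/*
 * The second open() reaches the "ndp->ni_vp != NULL" branch of
 * vn_open_cred() and fails with EEXIST.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tmp/vn_open_demo";
	int fd;

	fd = open(path, O_CREAT | O_EXCL | O_WRONLY, 0644);	/* creates */
	if (fd == -1) {
		perror("first open");
		return (1);
	}
	close(fd);

	fd = open(path, O_CREAT | O_EXCL | O_WRONLY, 0644);
	if (fd == -1)
		printf("second open fails as expected: %s\n", strerror(errno));
	unlink(path);
	return (0);
}
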
  256 
  257 /*
  258  * Check for write permissions on the specified vnode.
  259  * Prototype text segments cannot be written.
  260  */
  261 int
  262 vn_writechk(vp)
  263         register struct vnode *vp;
  264 {
  265 
  266         ASSERT_VOP_LOCKED(vp, "vn_writechk");
  267         /*
  268          * If there's shared text associated with
  269          * the vnode, try to free it up once.  If
  270          * we fail, we can't allow writing.
  271          */
  272         if (vp->v_vflag & VV_TEXT)
  273                 return (ETXTBSY);
  274 
  275         return (0);
  276 }
  277 
  278 /*
  279  * Vnode close call
  280  */
  281 int
  282 vn_close(vp, flags, file_cred, td)
  283         register struct vnode *vp;
  284         int flags;
  285         struct ucred *file_cred;
  286         struct thread *td;
  287 {
  288         struct mount *mp;
  289         int error, lock_flags;
  290 
  291         if (!(flags & FWRITE) && vp->v_mount != NULL &&
  292             vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
  293                 lock_flags = LK_SHARED;
  294         else
  295                 lock_flags = LK_EXCLUSIVE;
  296 
  297         VFS_ASSERT_GIANT(vp->v_mount);
  298 
  299         vn_start_write(vp, &mp, V_WAIT);
  300         vn_lock(vp, lock_flags | LK_RETRY);
  301         if (flags & FWRITE) {
  302                 VNASSERT(vp->v_writecount > 0, vp, 
  303                     ("vn_close: negative writecount"));
  304                 vp->v_writecount--;
  305         }
  306         error = VOP_CLOSE(vp, flags, file_cred, td);
  307         vput(vp);
  308         vn_finished_write(mp);
  309         return (error);
  310 }
  311 
  312 /*
  313  * Heuristic to detect sequential operation.
  314  */
  315 static int
  316 sequential_heuristic(struct uio *uio, struct file *fp)
  317 {
  318 
  319         if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
  320                 return (fp->f_seqcount << IO_SEQSHIFT);
  321 
  322         /*
  323          * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
  324          * that the first I/O is normally considered to be slightly
  325          * sequential.  Seeking to offset 0 doesn't change sequentiality
  326          * unless previous seeks have reduced f_seqcount to 0, in which
  327          * case offset 0 is not special.
  328          */
  329         if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
  330             uio->uio_offset == fp->f_nextoff) {
  331                 /*
  332                  * f_seqcount is in units of fixed-size blocks so that it
  333                  * depends mainly on the amount of sequential I/O and not
  334                  * much on the number of sequential I/O's.  The fixed size
  335                  * of 16384 is hard-coded here since it is (not quite) just
  336                  * a magic size that works well here.  This size is more
  337                  * closely related to the best I/O size for real disks than
  338                  * to any block size used by software.
  339                  */
  340                 fp->f_seqcount += howmany(uio->uio_resid, 16384);
  341                 if (fp->f_seqcount > IO_SEQMAX)
  342                         fp->f_seqcount = IO_SEQMAX;
  343                 return (fp->f_seqcount << IO_SEQSHIFT);
  344         }
  345 
   346         /* Not sequential.  Quickly draw down sequentiality. */
  347         if (fp->f_seqcount > 1)
  348                 fp->f_seqcount = 1;
  349         else
  350                 fp->f_seqcount = 0;
  351         return (0);
  352 }
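
The heuristic is easy to exercise outside the kernel.  Below is a minimal
user-space model, assuming the IO_SEQMAX (0x7f) and IO_SEQSHIFT (16) values
from sys/vnode.h and stripped-down file/uio structures:

#include <stdio.h>

#define	IO_SEQMAX	0x7f
#define	IO_SEQSHIFT	16
#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))

struct mfile { long f_nextoff; int f_seqcount; };
struct muio { long uio_offset; long uio_resid; };

static int
seq_heuristic(struct muio *uio, struct mfile *fp)
{
	if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
	    uio->uio_offset == fp->f_nextoff) {
		fp->f_seqcount += howmany(uio->uio_resid, 16384);
		if (fp->f_seqcount > IO_SEQMAX)
			fp->f_seqcount = IO_SEQMAX;
		return (fp->f_seqcount << IO_SEQSHIFT);
	}
	fp->f_seqcount = fp->f_seqcount > 1 ? 1 : 0;
	return (0);
}

int
main(void)
{
	struct mfile fp = { .f_nextoff = 0, .f_seqcount = 1 };	/* as open() does */
	struct muio uio = { .uio_offset = 0, .uio_resid = 65536 };
	int i;

	/* Four back-to-back 64 KB reads: f_seqcount ramps up by 4 each. */
	for (i = 0; i < 4; i++) {
		int ioflag = seq_heuristic(&uio, &fp);

		printf("read %d: f_seqcount %d, read-ahead hint %#x\n",
		    i, fp.f_seqcount, ioflag);
		uio.uio_offset += uio.uio_resid;
		fp.f_nextoff = uio.uio_offset;
	}
	return (0);
}
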
  353 
  354 /*
  355  * Package up an I/O request on a vnode into a uio and do it.
  356  */
  357 int
  358 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
  359     aresid, td)
  360         enum uio_rw rw;
  361         struct vnode *vp;
  362         void *base;
  363         int len;
  364         off_t offset;
  365         enum uio_seg segflg;
  366         int ioflg;
  367         struct ucred *active_cred;
  368         struct ucred *file_cred;
  369         int *aresid;
  370         struct thread *td;
  371 {
  372         struct uio auio;
  373         struct iovec aiov;
  374         struct mount *mp;
  375         struct ucred *cred;
  376         int error, lock_flags;
  377 
  378         VFS_ASSERT_GIANT(vp->v_mount);
  379 
  380         if ((ioflg & IO_NODELOCKED) == 0) {
  381                 mp = NULL;
  382                 if (rw == UIO_WRITE) { 
  383                         if (vp->v_type != VCHR &&
  384                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
  385                             != 0)
  386                                 return (error);
  387                         if (MNT_SHARED_WRITES(mp) ||
  388                             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
  389                                 lock_flags = LK_SHARED;
  390                         } else {
  391                                 lock_flags = LK_EXCLUSIVE;
  392                         }
  393                         vn_lock(vp, lock_flags | LK_RETRY);
  394                 } else
  395                         vn_lock(vp, LK_SHARED | LK_RETRY);
  396 
  397         }
  398         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
  399         auio.uio_iov = &aiov;
  400         auio.uio_iovcnt = 1;
  401         aiov.iov_base = base;
  402         aiov.iov_len = len;
  403         auio.uio_resid = len;
  404         auio.uio_offset = offset;
  405         auio.uio_segflg = segflg;
  406         auio.uio_rw = rw;
  407         auio.uio_td = td;
  408         error = 0;
  409 #ifdef MAC
  410         if ((ioflg & IO_NOMACCHECK) == 0) {
  411                 if (rw == UIO_READ)
  412                         error = mac_vnode_check_read(active_cred, file_cred,
  413                             vp);
  414                 else
  415                         error = mac_vnode_check_write(active_cred, file_cred,
  416                             vp);
  417         }
  418 #endif
  419         if (error == 0) {
  420                 if (file_cred)
  421                         cred = file_cred;
  422                 else
  423                         cred = active_cred;
  424                 if (rw == UIO_READ)
  425                         error = VOP_READ(vp, &auio, ioflg, cred);
  426                 else
  427                         error = VOP_WRITE(vp, &auio, ioflg, cred);
  428         }
  429         if (aresid)
  430                 *aresid = auio.uio_resid;
  431         else
  432                 if (auio.uio_resid && error == 0)
  433                         error = EIO;
  434         if ((ioflg & IO_NODELOCKED) == 0) {
  435                 if (rw == UIO_WRITE && vp->v_type != VCHR)
  436                         vn_finished_write(mp);
  437                 VOP_UNLOCK(vp, 0);
  438         }
  439         return (error);
  440 }
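
Kernel consumers typically wrap vn_rdwr() as in the sketch below.  This is a
hedged illustration of the common calling pattern, not a verbatim caller from
the tree; the helper name is hypothetical and kernel context is assumed:

/*
 * Read the first `len' bytes of an already-referenced vnode into a
 * kernel buffer; ioflg 0 lets vn_rdwr() take the locks for us.
 */
static int
read_head(struct vnode *vp, void *buf, int len, struct thread *td)
{
	int resid, error;

	error = vn_rdwr(UIO_READ, vp, buf, len, 0 /* offset */,
	    UIO_SYSSPACE, 0 /* ioflg */, td->td_ucred, NOCRED, &resid, td);
	if (error == 0 && resid != 0)
		error = EIO;		/* short read */
	return (error);
}
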
  441 
  442 /*
  443  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  444  * request is split up into smaller chunks and we try to avoid saturating
  445  * the buffer cache while potentially holding a vnode locked, so we 
  446  * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
  447  * to give other processes a chance to lock the vnode (either other processes
  448  * core'ing the same binary, or unrelated processes scanning the directory).
  449  */
  450 int
  451 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
  452     file_cred, aresid, td)
  453         enum uio_rw rw;
  454         struct vnode *vp;
  455         void *base;
  456         size_t len;
  457         off_t offset;
  458         enum uio_seg segflg;
  459         int ioflg;
  460         struct ucred *active_cred;
  461         struct ucred *file_cred;
  462         size_t *aresid;
  463         struct thread *td;
  464 {
  465         int error = 0;
  466         int iaresid;
  467 
  468         VFS_ASSERT_GIANT(vp->v_mount);
  469 
  470         do {
  471                 int chunk;
  472 
  473                 /*
  474                  * Force `offset' to a multiple of MAXBSIZE except possibly
  475                  * for the first chunk, so that filesystems only need to
  476                  * write full blocks except possibly for the first and last
  477                  * chunks.
  478                  */
  479                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
  480 
  481                 if (chunk > len)
  482                         chunk = len;
  483                 if (rw != UIO_READ && vp->v_type == VREG)
  484                         bwillwrite();
  485                 iaresid = 0;
  486                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
  487                     ioflg, active_cred, file_cred, &iaresid, td);
  488                 len -= chunk;   /* aresid calc already includes length */
  489                 if (error)
  490                         break;
  491                 offset += chunk;
  492                 base = (char *)base + chunk;
  493                 uio_yield();
  494         } while (len);
  495         if (aresid)
  496                 *aresid = len + iaresid;
  497         return (error);
  498 }
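
The chunk expression forces every chunk after the first onto a MAXBSIZE
boundary, so only the first and last chunks can be partial.  A stand-alone
check of the arithmetic, assuming the usual MAXBSIZE of 65536:

#include <stdio.h>

#define	MAXBSIZE	65536

int
main(void)
{
	long offset = 10000;		/* deliberately unaligned start */
	long len = 200000;

	while (len > 0) {
		long chunk = MAXBSIZE - (unsigned long)offset % MAXBSIZE;

		if (chunk > len)
			chunk = len;
		printf("offset %7ld  chunk %6ld%s\n", offset, chunk,
		    offset % MAXBSIZE == 0 ? "" : "  (first/unaligned)");
		offset += chunk;
		len -= chunk;
	}
	return (0);
}
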
  499 
  500 /*
  501  * File table vnode read routine.
  502  */
  503 static int
  504 vn_read(fp, uio, active_cred, flags, td)
  505         struct file *fp;
  506         struct uio *uio;
  507         struct ucred *active_cred;
  508         struct thread *td;
  509         int flags;
  510 {
  511         struct vnode *vp;
  512         int error, ioflag;
  513         struct mtx *mtxp;
  514         int advice, vfslocked;
  515         off_t offset, start, end;
  516 
  517         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
  518             uio->uio_td, td));
  519         mtxp = NULL;
  520         vp = fp->f_vnode;
  521         ioflag = 0;
  522         if (fp->f_flag & FNONBLOCK)
  523                 ioflag |= IO_NDELAY;
  524         if (fp->f_flag & O_DIRECT)
  525                 ioflag |= IO_DIRECT;
  526         advice = POSIX_FADV_NORMAL;
  527         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
  528         /*
  529          * According to McKusick the vn lock was protecting f_offset here.
  530          * It is now protected by the FOFFSET_LOCKED flag.
  531          */
  532         if ((flags & FOF_OFFSET) == 0 || fp->f_advice != NULL) {
  533                 mtxp = mtx_pool_find(mtxpool_sleep, fp);
  534                 mtx_lock(mtxp);
  535                 if ((flags & FOF_OFFSET) == 0) {
  536                         while (fp->f_vnread_flags & FOFFSET_LOCKED) {
  537                                 fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
   538                                 msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
  539                                     "vnread offlock", 0);
  540                         }
  541                         fp->f_vnread_flags |= FOFFSET_LOCKED;
  542                         uio->uio_offset = fp->f_offset;
  543                 }
  544                 if (fp->f_advice != NULL &&
  545                     uio->uio_offset >= fp->f_advice->fa_start &&
  546                     uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
  547                         advice = fp->f_advice->fa_advice;
  548                 mtx_unlock(mtxp);
  549         }
  550         vn_lock(vp, LK_SHARED | LK_RETRY);
  551 
  552         switch (advice) {
  553         case POSIX_FADV_NORMAL:
  554         case POSIX_FADV_SEQUENTIAL:
  555         case POSIX_FADV_NOREUSE:
  556                 ioflag |= sequential_heuristic(uio, fp);
  557                 break;
  558         case POSIX_FADV_RANDOM:
  559                 /* Disable read-ahead for random I/O. */
  560                 break;
  561         }
  562         offset = uio->uio_offset;
  563 
  564 #ifdef MAC
  565         error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
  566         if (error == 0)
  567 #endif
  568                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
  569         if ((flags & FOF_OFFSET) == 0) {
  570                 fp->f_offset = uio->uio_offset;
  571                 mtx_lock(mtxp);
  572                 if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
  573                         wakeup(&fp->f_vnread_flags);
  574                 fp->f_vnread_flags = 0;
  575                 mtx_unlock(mtxp);
  576         }
  577         fp->f_nextoff = uio->uio_offset;
  578         VOP_UNLOCK(vp, 0);
  579         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
  580             offset != uio->uio_offset) {
  581                 /*
  582                  * Use POSIX_FADV_DONTNEED to flush clean pages and
  583                  * buffers for the backing file after a
  584                  * POSIX_FADV_NOREUSE read(2).  To optimize the common
  585                  * case of using POSIX_FADV_NOREUSE with sequential
  586                  * access, track the previous implicit DONTNEED
  587                  * request and grow this request to include the
  588                  * current read(2) in addition to the previous
  589                  * DONTNEED.  With purely sequential access this will
   590                  * cause the DONTNEED requests to continuously grow to
  591                  * cover all of the previously read regions of the
  592                  * file.  This allows filesystem blocks that are
  593                  * accessed by multiple calls to read(2) to be flushed
  594                  * once the last read(2) finishes.
  595                  */
  596                 start = offset;
  597                 end = uio->uio_offset - 1;
  598                 mtx_lock(mtxp);
  599                 if (fp->f_advice != NULL &&
  600                     fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
  601                         if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
  602                                 start = fp->f_advice->fa_prevstart;
  603                         else if (fp->f_advice->fa_prevstart != 0 &&
  604                             fp->f_advice->fa_prevstart == end + 1)
  605                                 end = fp->f_advice->fa_prevend;
  606                         fp->f_advice->fa_prevstart = start;
  607                         fp->f_advice->fa_prevend = end;
  608                 }
  609                 mtx_unlock(mtxp);
  610                 error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
  611         }
  612         VFS_UNLOCK_GIANT(vfslocked);
  613         return (error);
  614 }
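
The fa_prevstart/fa_prevend bookkeeping above is what lets a run of NOREUSE
reads issue one steadily growing DONTNEED range instead of many small ones.
A stand-alone model of just the merge step, with simplified types:

#include <stdio.h>

struct madvice { long fa_prevstart, fa_prevend; };

static void
coalesce(struct madvice *fa, long *start, long *end)
{
	if (*start != 0 && fa->fa_prevend + 1 == *start)
		*start = fa->fa_prevstart;	/* extend backward */
	else if (fa->fa_prevstart != 0 && fa->fa_prevstart == *end + 1)
		*end = fa->fa_prevend;		/* extend forward */
	fa->fa_prevstart = *start;
	fa->fa_prevend = *end;
}

int
main(void)
{
	struct madvice fa = { 0, 0 };
	long off, start, end;

	/* Three sequential 4 KB "reads": the advised range keeps growing. */
	for (off = 0; off < 3 * 4096; off += 4096) {
		start = off;
		end = off + 4096 - 1;
		coalesce(&fa, &start, &end);
		printf("VOP_ADVISE DONTNEED [%ld, %ld]\n", start, end);
	}
	return (0);
}
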
  615 
  616 /*
  617  * File table vnode write routine.
  618  */
  619 static int
  620 vn_write(fp, uio, active_cred, flags, td)
  621         struct file *fp;
  622         struct uio *uio;
  623         struct ucred *active_cred;
  624         struct thread *td;
  625         int flags;
  626 {
  627         struct vnode *vp;
  628         struct mount *mp;
  629         int error, ioflag, lock_flags;
  630         struct mtx *mtxp;
  631         int advice, vfslocked;
  632         off_t offset, start, end;
  633 
  634         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
  635             uio->uio_td, td));
  636         vp = fp->f_vnode;
  637         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
  638         if (vp->v_type == VREG)
  639                 bwillwrite();
  640         ioflag = IO_UNIT;
  641         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
  642                 ioflag |= IO_APPEND;
  643         if (fp->f_flag & FNONBLOCK)
  644                 ioflag |= IO_NDELAY;
  645         if (fp->f_flag & O_DIRECT)
  646                 ioflag |= IO_DIRECT;
  647         if ((fp->f_flag & O_FSYNC) ||
  648             (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
  649                 ioflag |= IO_SYNC;
  650         mp = NULL;
  651         if (vp->v_type != VCHR &&
  652             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
  653                 goto unlock;
  654  
  655         if ((MNT_SHARED_WRITES(mp) ||
  656             ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
  657             (flags & FOF_OFFSET) != 0) {
  658                 lock_flags = LK_SHARED;
  659         } else {
  660                 lock_flags = LK_EXCLUSIVE;
  661         }
  662 
  663         vn_lock(vp, lock_flags | LK_RETRY);
  664         if ((flags & FOF_OFFSET) == 0)
  665                 uio->uio_offset = fp->f_offset;
  666         advice = POSIX_FADV_NORMAL;
  667         mtxp = NULL;
  668         if (fp->f_advice != NULL) {
  669                 mtxp = mtx_pool_find(mtxpool_sleep, fp);
  670                 mtx_lock(mtxp);
  671                 if (fp->f_advice != NULL &&
  672                     uio->uio_offset >= fp->f_advice->fa_start &&
  673                     uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
  674                         advice = fp->f_advice->fa_advice;
  675                 mtx_unlock(mtxp);
  676         }
  677         switch (advice) {
  678         case POSIX_FADV_NORMAL:
  679         case POSIX_FADV_SEQUENTIAL:
  680         case POSIX_FADV_NOREUSE:
  681                 ioflag |= sequential_heuristic(uio, fp);
  682                 break;
  683         case POSIX_FADV_RANDOM:
  684                 /* XXX: Is this correct? */
  685                 break;
  686         }
  687         offset = uio->uio_offset;
  688 
  689 #ifdef MAC
  690         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
  691         if (error == 0)
  692 #endif
  693                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
  694         if ((flags & FOF_OFFSET) == 0)
  695                 fp->f_offset = uio->uio_offset;
  696         fp->f_nextoff = uio->uio_offset;
  697         VOP_UNLOCK(vp, 0);
  698         if (vp->v_type != VCHR)
  699                 vn_finished_write(mp);
  700         if (error == 0 && advice == POSIX_FADV_NOREUSE &&
  701             offset != uio->uio_offset) {
  702                 /*
  703                  * Use POSIX_FADV_DONTNEED to flush clean pages and
  704                  * buffers for the backing file after a
  705                  * POSIX_FADV_NOREUSE write(2).  To optimize the
  706                  * common case of using POSIX_FADV_NOREUSE with
  707                  * sequential access, track the previous implicit
  708                  * DONTNEED request and grow this request to include
  709                  * the current write(2) in addition to the previous
  710                  * DONTNEED.  With purely sequential access this will
   711                  * cause the DONTNEED requests to continuously grow to
  712                  * cover all of the previously written regions of the
  713                  * file.
  714                  *
  715                  * Note that the blocks just written are almost
  716                  * certainly still dirty, so this only works when
  717                  * VOP_ADVISE() calls from subsequent writes push out
  718                  * the data written by this write(2) once the backing
  719                  * buffers are clean.  However, as compared to forcing
  720                  * IO_DIRECT, this gives much saner behavior.  Write
  721                  * clustering is still allowed, and clean pages are
  722                  * merely moved to the cache page queue rather than
  723                  * outright thrown away.  This means a subsequent
  724                  * read(2) can still avoid hitting the disk if the
  725                  * pages have not been reclaimed.
  726                  *
  727                  * This does make POSIX_FADV_NOREUSE largely useless
  728                  * with non-sequential access.  However, sequential
  729                  * access is the more common use case and the flag is
  730                  * merely advisory.
  731                  */
  732                 start = offset;
  733                 end = uio->uio_offset - 1;
  734                 mtx_lock(mtxp);
  735                 if (fp->f_advice != NULL &&
  736                     fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) {
  737                         if (start != 0 && fp->f_advice->fa_prevend + 1 == start)
  738                                 start = fp->f_advice->fa_prevstart;
  739                         else if (fp->f_advice->fa_prevstart != 0 &&
  740                             fp->f_advice->fa_prevstart == end + 1)
  741                                 end = fp->f_advice->fa_prevend;
  742                         fp->f_advice->fa_prevstart = start;
  743                         fp->f_advice->fa_prevend = end;
  744                 }
  745                 mtx_unlock(mtxp);
  746                 error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED);
  747         }
  748         
  749 unlock:
  750         VFS_UNLOCK_GIANT(vfslocked);
  751         return (error);
  752 }
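
User space opts into this machinery with posix_fadvise(2).  A minimal
example; the advice is only a hint, so the actual effect depends on the
filesystem:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	char buf[65536];
	int error, fd;

	fd = open("/tmp/noreuse_demo", O_CREAT | O_RDWR, 0644);
	if (fd == -1)
		return (1);
	error = posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
	if (error != 0)			/* returns an errno value, not -1 */
		fprintf(stderr, "posix_fadvise: %s\n", strerror(error));
	memset(buf, 0, sizeof(buf));
	/* Sequential writes now take the NOREUSE/DONTNEED path above. */
	write(fd, buf, sizeof(buf));
	write(fd, buf, sizeof(buf));
	close(fd);
	return (0);
}
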
  753 
  754 /*
  755  * File table truncate routine.
  756  */
  757 static int
  758 vn_truncate(fp, length, active_cred, td)
  759         struct file *fp;
  760         off_t length;
  761         struct ucred *active_cred;
  762         struct thread *td;
  763 {
  764         struct vattr vattr;
  765         struct mount *mp;
  766         struct vnode *vp;
  767         int vfslocked;
  768         int error;
  769 
  770         vp = fp->f_vnode;
  771         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
  772         error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
  773         if (error) {
  774                 VFS_UNLOCK_GIANT(vfslocked);
  775                 return (error);
  776         }
  777         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  778         if (vp->v_type == VDIR) {
  779                 error = EISDIR;
  780                 goto out;
  781         }
  782 #ifdef MAC
  783         error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
  784         if (error)
  785                 goto out;
  786 #endif
  787         error = vn_writechk(vp);
  788         if (error == 0) {
  789                 VATTR_NULL(&vattr);
  790                 vattr.va_size = length;
  791                 error = VOP_SETATTR(vp, &vattr, fp->f_cred);
  792         }
  793 out:
  794         VOP_UNLOCK(vp, 0);
  795         vn_finished_write(mp);
  796         VFS_UNLOCK_GIANT(vfslocked);
  797         return (error);
  798 }
  799 
  800 /*
  801  * File table vnode stat routine.
  802  */
  803 static int
  804 vn_statfile(fp, sb, active_cred, td)
  805         struct file *fp;
  806         struct stat *sb;
  807         struct ucred *active_cred;
  808         struct thread *td;
  809 {
  810         struct vnode *vp = fp->f_vnode;
  811         int vfslocked;
  812         int error;
  813 
  814         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
  815         vn_lock(vp, LK_SHARED | LK_RETRY);
  816         error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
  817         VOP_UNLOCK(vp, 0);
  818         VFS_UNLOCK_GIANT(vfslocked);
  819 
  820         return (error);
  821 }
  822 
  823 /*
  824  * Stat a vnode; implementation for the stat syscall
  825  */
  826 int
  827 vn_stat(vp, sb, active_cred, file_cred, td)
  828         struct vnode *vp;
  829         register struct stat *sb;
  830         struct ucred *active_cred;
  831         struct ucred *file_cred;
  832         struct thread *td;
  833 {
  834         struct vattr vattr;
  835         register struct vattr *vap;
  836         int error;
  837         u_short mode;
  838 
  839 #ifdef MAC
  840         error = mac_vnode_check_stat(active_cred, file_cred, vp);
  841         if (error)
  842                 return (error);
  843 #endif
  844 
  845         vap = &vattr;
  846 
  847         /*
  848          * Initialize defaults for new and unusual fields, so that file
  849          * systems which don't support these fields don't need to know
  850          * about them.
  851          */
  852         vap->va_birthtime.tv_sec = -1;
  853         vap->va_birthtime.tv_nsec = 0;
  854         vap->va_fsid = VNOVAL;
  855         vap->va_rdev = NODEV;
  856 
  857         error = VOP_GETATTR(vp, vap, active_cred);
  858         if (error)
  859                 return (error);
  860 
  861         /*
  862          * Zero the spare stat fields
  863          */
  864         bzero(sb, sizeof *sb);
  865 
  866         /*
  867          * Copy from vattr table
  868          */
  869         if (vap->va_fsid != VNOVAL)
  870                 sb->st_dev = vap->va_fsid;
  871         else
  872                 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
  873         sb->st_ino = vap->va_fileid;
  874         mode = vap->va_mode;
  875         switch (vap->va_type) {
  876         case VREG:
  877                 mode |= S_IFREG;
  878                 break;
  879         case VDIR:
  880                 mode |= S_IFDIR;
  881                 break;
  882         case VBLK:
  883                 mode |= S_IFBLK;
  884                 break;
  885         case VCHR:
  886                 mode |= S_IFCHR;
  887                 break;
  888         case VLNK:
  889                 mode |= S_IFLNK;
  890                 break;
  891         case VSOCK:
  892                 mode |= S_IFSOCK;
  893                 break;
  894         case VFIFO:
  895                 mode |= S_IFIFO;
  896                 break;
  897         default:
  898                 return (EBADF);
   899         }
  900         sb->st_mode = mode;
  901         sb->st_nlink = vap->va_nlink;
  902         sb->st_uid = vap->va_uid;
  903         sb->st_gid = vap->va_gid;
  904         sb->st_rdev = vap->va_rdev;
  905         if (vap->va_size > OFF_MAX)
  906                 return (EOVERFLOW);
  907         sb->st_size = vap->va_size;
  908         sb->st_atimespec = vap->va_atime;
  909         sb->st_mtimespec = vap->va_mtime;
  910         sb->st_ctimespec = vap->va_ctime;
  911         sb->st_birthtimespec = vap->va_birthtime;
  912 
  913         /*
  914          * According to www.opengroup.org, the meaning of st_blksize is 
  915          *   "a filesystem-specific preferred I/O block size for this 
  916          *    object.  In some filesystem types, this may vary from file
  917          *    to file"
   918          * Use minimum/default of PAGE_SIZE (e.g. for VCHR).
  919          */
  920 
  921         sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
  922         
  923         sb->st_flags = vap->va_flags;
  924         if (priv_check(td, PRIV_VFS_GENERATION))
  925                 sb->st_gen = 0;
  926         else
  927                 sb->st_gen = vap->va_gen;
  928 
  929         sb->st_blocks = vap->va_bytes / S_BLKSIZE;
  930         return (0);
  931 }
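
The va_type switch above assembles the same S_IF* bits that the S_IS*
macros decode in user space:

#include <sys/stat.h>
#include <stdio.h>

int
main(void)
{
	struct stat sb;

	if (stat("/etc", &sb) == -1)
		return (1);
	printf("mode %#o, %s, blksize %ld, blocks %lld\n",
	    (unsigned)(sb.st_mode & ~S_IFMT),
	    S_ISDIR(sb.st_mode) ? "directory (VDIR -> S_IFDIR)" : "other",
	    (long)sb.st_blksize, (long long)sb.st_blocks);
	return (0);
}
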
  932 
  933 /*
  934  * File table vnode ioctl routine.
  935  */
  936 static int
  937 vn_ioctl(fp, com, data, active_cred, td)
  938         struct file *fp;
  939         u_long com;
  940         void *data;
  941         struct ucred *active_cred;
  942         struct thread *td;
  943 {
  944         struct vnode *vp = fp->f_vnode;
  945         struct vattr vattr;
  946         int vfslocked;
  947         int error;
  948 
  949         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
  950         error = ENOTTY;
  951         switch (vp->v_type) {
  952         case VREG:
  953         case VDIR:
  954                 if (com == FIONREAD) {
  955                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  956                         error = VOP_GETATTR(vp, &vattr, active_cred);
  957                         VOP_UNLOCK(vp, 0);
  958                         if (!error)
  959                                 *(int *)data = vattr.va_size - fp->f_offset;
  960                 }
  961                 if (com == FIONBIO || com == FIOASYNC)  /* XXX */
  962                         error = 0;
  963                 else
  964                         error = VOP_IOCTL(vp, com, data, fp->f_flag,
  965                             active_cred, td);
  966                 break;
  967 
  968         default:
  969                 break;
  970         }
  971         VFS_UNLOCK_GIANT(vfslocked);
  972         return (error);
  973 }
  974 
  975 /*
  976  * File table vnode poll routine.
  977  */
  978 static int
  979 vn_poll(fp, events, active_cred, td)
  980         struct file *fp;
  981         int events;
  982         struct ucred *active_cred;
  983         struct thread *td;
  984 {
  985         struct vnode *vp;
  986         int vfslocked;
  987         int error;
  988 
  989         vp = fp->f_vnode;
  990         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
  991 #ifdef MAC
  992         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
  993         error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
  994         VOP_UNLOCK(vp, 0);
  995         if (!error)
  996 #endif
  997 
  998         error = VOP_POLL(vp, events, fp->f_cred, td);
  999         VFS_UNLOCK_GIANT(vfslocked);
 1000         return (error);
 1001 }
 1002 
 1003 /*
 1004  * Acquire the requested lock and then check for validity.  LK_RETRY
 1005  * permits vn_lock to return doomed vnodes.
 1006  */
 1007 int
 1008 _vn_lock(struct vnode *vp, int flags, char *file, int line)
 1009 {
 1010         int error;
 1011 
 1012         VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 1013             ("vn_lock called with no locktype."));
 1014         do {
 1015 #ifdef DEBUG_VFS_LOCKS
 1016                 KASSERT(vp->v_holdcnt != 0,
 1017                     ("vn_lock %p: zero hold count", vp));
 1018 #endif
 1019                 error = VOP_LOCK1(vp, flags, file, line);
 1020                 flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
 1021                 KASSERT((flags & LK_RETRY) == 0 || error == 0,
  1022             ("LK_RETRY set with incompatible flags (0x%x) or an error occurred (%d)",
 1023                     flags, error));
 1024                 /*
 1025                  * Callers specify LK_RETRY if they wish to get dead vnodes.
 1026                  * If RETRY is not set, we return ENOENT instead.
 1027                  */
 1028                 if (error == 0 && vp->v_iflag & VI_DOOMED &&
 1029                     (flags & LK_RETRY) == 0) {
 1030                         VOP_UNLOCK(vp, 0);
 1031                         error = ENOENT;
 1032                         break;
 1033                 }
 1034         } while (flags & LK_RETRY && error != 0);
 1035         return (error);
 1036 }
 1037 
 1038 /*
 1039  * File table vnode close routine.
 1040  */
 1041 static int
 1042 vn_closefile(fp, td)
 1043         struct file *fp;
 1044         struct thread *td;
 1045 {
 1046         struct vnode *vp;
 1047         struct flock lf;
 1048         int vfslocked;
 1049         int error;
 1050 
 1051         vp = fp->f_vnode;
 1052 
 1053         vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 1054         if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
 1055                 lf.l_whence = SEEK_SET;
 1056                 lf.l_start = 0;
 1057                 lf.l_len = 0;
 1058                 lf.l_type = F_UNLCK;
 1059                 (void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 1060         }
 1061 
 1062         fp->f_ops = &badfileops;
 1063 
 1064         error = vn_close(vp, fp->f_flag, fp->f_cred, td);
 1065         VFS_UNLOCK_GIANT(vfslocked);
 1066         return (error);
 1067 }
 1068 
 1069 /*
  1070  * Prepare to start a filesystem write operation. If the operation is
 1071  * permitted, then we bump the count of operations in progress and
 1072  * proceed. If a suspend request is in progress, we wait until the
 1073  * suspension is over, and then proceed.
 1074  */
 1075 int
 1076 vn_start_write(vp, mpp, flags)
 1077         struct vnode *vp;
 1078         struct mount **mpp;
 1079         int flags;
 1080 {
 1081         struct mount *mp;
 1082         int error;
 1083 
 1084         error = 0;
 1085         /*
  1086          * If a vnode is provided, get and return the mount point
  1087          * to which it will write.
 1088          */
 1089         if (vp != NULL) {
 1090                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 1091                         *mpp = NULL;
 1092                         if (error != EOPNOTSUPP)
 1093                                 return (error);
 1094                         return (0);
 1095                 }
 1096         }
 1097         if ((mp = *mpp) == NULL)
 1098                 return (0);
 1099 
 1100         /*
 1101          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 1102          * a vfs_ref().
 1103          * As long as a vnode is not provided we need to acquire a
 1104          * refcount for the provided mountpoint too, in order to
 1105          * emulate a vfs_ref().
 1106          */
 1107         MNT_ILOCK(mp);
 1108         if (vp == NULL)
 1109                 MNT_REF(mp);
 1110 
 1111         /*
 1112          * Check on status of suspension.
 1113          */
 1114         if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 1115             mp->mnt_susp_owner != curthread) {
 1116                 while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 1117                         if (flags & V_NOWAIT) {
 1118                                 error = EWOULDBLOCK;
 1119                                 goto unlock;
 1120                         }
 1121                         error = msleep(&mp->mnt_flag, MNT_MTX(mp),
 1122                             (PUSER - 1) | (flags & PCATCH), "suspfs", 0);
 1123                         if (error)
 1124                                 goto unlock;
 1125                 }
 1126         }
 1127         if (flags & V_XSLEEP)
 1128                 goto unlock;
 1129         mp->mnt_writeopcount++;
 1130 unlock:
 1131         if (error != 0 || (flags & V_XSLEEP) != 0)
 1132                 MNT_REL(mp);
 1133         MNT_IUNLOCK(mp);
 1134         return (error);
 1135 }
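
Callers bracket every write-side operation with this pair, as vn_write()
above does.  The canonical shape, sketched with abbreviated error handling
(kernel context assumed; the helper name is hypothetical):

static int
write_bracket(struct vnode *vp, struct thread *td)
{
	struct mount *mp;
	int error;

	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);		/* interrupted or mount is gone */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... VOP_WRITE(), VOP_SETATTR(), or similar ... */
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);		/* may wake a pending suspender */
	return (0);
}
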
 1136 
 1137 /*
 1138  * Secondary suspension. Used by operations such as vop_inactive
 1139  * routines that are needed by the higher level functions. These
 1140  * are allowed to proceed until all the higher level functions have
 1141  * completed (indicated by mnt_writeopcount dropping to zero). At that
 1142  * time, these operations are halted until the suspension is over.
 1143  */
 1144 int
 1145 vn_start_secondary_write(vp, mpp, flags)
 1146         struct vnode *vp;
 1147         struct mount **mpp;
 1148         int flags;
 1149 {
 1150         struct mount *mp;
 1151         int error;
 1152 
 1153  retry:
 1154         if (vp != NULL) {
 1155                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 1156                         *mpp = NULL;
 1157                         if (error != EOPNOTSUPP)
 1158                                 return (error);
 1159                         return (0);
 1160                 }
 1161         }
 1162         /*
 1163          * If we are not suspended or have not yet reached suspended
 1164          * mode, then let the operation proceed.
 1165          */
 1166         if ((mp = *mpp) == NULL)
 1167                 return (0);
 1168 
 1169         /*
 1170          * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 1171          * a vfs_ref().
 1172          * As long as a vnode is not provided we need to acquire a
 1173          * refcount for the provided mountpoint too, in order to
 1174          * emulate a vfs_ref().
 1175          */
 1176         MNT_ILOCK(mp);
 1177         if (vp == NULL)
 1178                 MNT_REF(mp);
 1179         if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 1180                 mp->mnt_secondary_writes++;
 1181                 mp->mnt_secondary_accwrites++;
 1182                 MNT_IUNLOCK(mp);
 1183                 return (0);
 1184         }
 1185         if (flags & V_NOWAIT) {
 1186                 MNT_REL(mp);
 1187                 MNT_IUNLOCK(mp);
 1188                 return (EWOULDBLOCK);
 1189         }
 1190         /*
 1191          * Wait for the suspension to finish.
 1192          */
 1193         error = msleep(&mp->mnt_flag, MNT_MTX(mp),
 1194                        (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
 1195         vfs_rel(mp);
 1196         if (error == 0)
 1197                 goto retry;
 1198         return (error);
 1199 }
 1200 
 1201 /*
 1202  * Filesystem write operation has completed. If we are suspending and this
 1203  * operation is the last one, notify the suspender that the suspension is
 1204  * now in effect.
 1205  */
 1206 void
 1207 vn_finished_write(mp)
 1208         struct mount *mp;
 1209 {
 1210         if (mp == NULL)
 1211                 return;
 1212         MNT_ILOCK(mp);
 1213         MNT_REL(mp);
 1214         mp->mnt_writeopcount--;
 1215         if (mp->mnt_writeopcount < 0)
 1216                 panic("vn_finished_write: neg cnt");
 1217         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 1218             mp->mnt_writeopcount <= 0)
 1219                 wakeup(&mp->mnt_writeopcount);
 1220         MNT_IUNLOCK(mp);
 1221 }
 1222 
 1223 
 1224 /*
 1225  * Filesystem secondary write operation has completed. If we are
 1226  * suspending and this operation is the last one, notify the suspender
 1227  * that the suspension is now in effect.
 1228  */
 1229 void
 1230 vn_finished_secondary_write(mp)
 1231         struct mount *mp;
 1232 {
 1233         if (mp == NULL)
 1234                 return;
 1235         MNT_ILOCK(mp);
 1236         MNT_REL(mp);
 1237         mp->mnt_secondary_writes--;
 1238         if (mp->mnt_secondary_writes < 0)
 1239                 panic("vn_finished_secondary_write: neg cnt");
 1240         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 1241             mp->mnt_secondary_writes <= 0)
 1242                 wakeup(&mp->mnt_secondary_writes);
 1243         MNT_IUNLOCK(mp);
 1244 }
 1245 
 1246 
 1247 
 1248 /*
 1249  * Request a filesystem to suspend write operations.
 1250  */
 1251 int
 1252 vfs_write_suspend(mp)
 1253         struct mount *mp;
 1254 {
 1255         int error;
 1256 
 1257         MNT_ILOCK(mp);
 1258         if (mp->mnt_susp_owner == curthread) {
 1259                 MNT_IUNLOCK(mp);
 1260                 return (EALREADY);
 1261         }
 1262         while (mp->mnt_kern_flag & MNTK_SUSPEND)
 1263                 msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 1264         mp->mnt_kern_flag |= MNTK_SUSPEND;
 1265         mp->mnt_susp_owner = curthread;
 1266         if (mp->mnt_writeopcount > 0)
 1267                 (void) msleep(&mp->mnt_writeopcount, 
 1268                     MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 1269         else
 1270                 MNT_IUNLOCK(mp);
 1271         if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
 1272                 vfs_write_resume(mp);
 1273         return (error);
 1274 }
 1275 
 1276 /*
 1277  * Request a filesystem to resume write operations.
 1278  */
 1279 void
 1280 vfs_write_resume(mp)
 1281         struct mount *mp;
 1282 {
 1283 
 1284         MNT_ILOCK(mp);
 1285         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 1286                 KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 1287                 mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 1288                                        MNTK_SUSPENDED);
 1289                 mp->mnt_susp_owner = NULL;
 1290                 wakeup(&mp->mnt_writeopcount);
 1291                 wakeup(&mp->mnt_flag);
 1292                 curthread->td_pflags &= ~TDP_IGNSUSP;
 1293                 MNT_IUNLOCK(mp);
 1294                 VFS_SUSP_CLEAN(mp);
 1295         } else
 1296                 MNT_IUNLOCK(mp);
 1297 }
 1298 
 1299 /*
 1300  * Implement kqueues for files by translating it to vnode operation.
 1301  */
 1302 static int
 1303 vn_kqfilter(struct file *fp, struct knote *kn)
 1304 {
 1305         int vfslocked;
 1306         int error;
 1307 
 1308         vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 1309         error = VOP_KQFILTER(fp->f_vnode, kn);
 1310         VFS_UNLOCK_GIANT(vfslocked);
 1311 
  1312         return (error);
 1313 }
 1314 
 1315 /*
 1316  * Simplified in-kernel wrapper calls for extended attribute access.
 1317  * Both calls pass in a NULL credential, authorizing as "kernel" access.
 1318  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 1319  */
 1320 int
 1321 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 1322     const char *attrname, int *buflen, char *buf, struct thread *td)
 1323 {
 1324         struct uio      auio;
 1325         struct iovec    iov;
 1326         int     error;
 1327 
 1328         iov.iov_len = *buflen;
 1329         iov.iov_base = buf;
 1330 
 1331         auio.uio_iov = &iov;
 1332         auio.uio_iovcnt = 1;
 1333         auio.uio_rw = UIO_READ;
 1334         auio.uio_segflg = UIO_SYSSPACE;
 1335         auio.uio_td = td;
 1336         auio.uio_offset = 0;
 1337         auio.uio_resid = *buflen;
 1338 
 1339         if ((ioflg & IO_NODELOCKED) == 0)
 1340                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1341 
 1342         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1343 
 1344         /* authorize attribute retrieval as kernel */
 1345         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 1346             td);
 1347 
 1348         if ((ioflg & IO_NODELOCKED) == 0)
 1349                 VOP_UNLOCK(vp, 0);
 1350 
 1351         if (error == 0) {
 1352                 *buflen = *buflen - auio.uio_resid;
 1353         }
 1354 
 1355         return (error);
 1356 }
 1357 
 1358 /*
 1359  * XXX failure mode if partially written?
 1360  */
 1361 int
 1362 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 1363     const char *attrname, int buflen, char *buf, struct thread *td)
 1364 {
 1365         struct uio      auio;
 1366         struct iovec    iov;
 1367         struct mount    *mp;
 1368         int     error;
 1369 
 1370         iov.iov_len = buflen;
 1371         iov.iov_base = buf;
 1372 
 1373         auio.uio_iov = &iov;
 1374         auio.uio_iovcnt = 1;
 1375         auio.uio_rw = UIO_WRITE;
 1376         auio.uio_segflg = UIO_SYSSPACE;
 1377         auio.uio_td = td;
 1378         auio.uio_offset = 0;
 1379         auio.uio_resid = buflen;
 1380 
 1381         if ((ioflg & IO_NODELOCKED) == 0) {
 1382                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 1383                         return (error);
 1384                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1385         }
 1386 
 1387         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1388 
 1389         /* authorize attribute setting as kernel */
 1390         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 1391 
 1392         if ((ioflg & IO_NODELOCKED) == 0) {
 1393                 vn_finished_write(mp);
 1394                 VOP_UNLOCK(vp, 0);
 1395         }
 1396 
 1397         return (error);
 1398 }
 1399 
 1400 int
 1401 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 1402     const char *attrname, struct thread *td)
 1403 {
 1404         struct mount    *mp;
 1405         int     error;
 1406 
 1407         if ((ioflg & IO_NODELOCKED) == 0) {
 1408                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 1409                         return (error);
 1410                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 1411         }
 1412 
 1413         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1414 
 1415         /* authorize attribute removal as kernel */
 1416         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 1417         if (error == EOPNOTSUPP)
 1418                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 1419                     NULL, td);
 1420 
 1421         if ((ioflg & IO_NODELOCKED) == 0) {
 1422                 vn_finished_write(mp);
 1423                 VOP_UNLOCK(vp, 0);
 1424         }
 1425 
 1426         return (error);
 1427 }
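
A hedged sketch of a typical vn_extattr_get() caller: fetch a system-
namespace attribute into a stack buffer, letting the wrapper take the vnode
lock (ioflg 0).  The attribute name is illustrative and kernel context is
assumed:

static int
fetch_attr(struct vnode *vp, struct thread *td)
{
	char buf[256];
	int buflen = sizeof(buf);
	int error;

	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "posix1e.acl_access", &buflen, buf, td);
	if (error == 0)
		printf("got %d bytes of attribute data\n", buflen);
	return (error);
}
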
 1428 
 1429 int
 1430 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 1431 {
 1432         struct mount *mp;
 1433         int ltype, error;
 1434 
 1435         mp = vp->v_mount;
 1436         ltype = VOP_ISLOCKED(vp);
 1437         KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 1438             ("vn_vget_ino: vp not locked"));
 1439         error = vfs_busy(mp, MBF_NOWAIT);
 1440         if (error != 0) {
 1441                 vfs_ref(mp);
 1442                 VOP_UNLOCK(vp, 0);
 1443                 error = vfs_busy(mp, 0);
 1444                 vn_lock(vp, ltype | LK_RETRY);
 1445                 vfs_rel(mp);
 1446                 if (error != 0)
 1447                         return (ENOENT);
 1448                 if (vp->v_iflag & VI_DOOMED) {
 1449                         vfs_unbusy(mp);
 1450                         return (ENOENT);
 1451                 }
 1452         }
 1453         VOP_UNLOCK(vp, 0);
 1454         error = VFS_VGET(mp, ino, lkflags, rvp);
 1455         vfs_unbusy(mp);
 1456         vn_lock(vp, ltype | LK_RETRY);
 1457         if (vp->v_iflag & VI_DOOMED) {
 1458                 if (error == 0)
 1459                         vput(*rvp);
 1460                 error = ENOENT;
 1461         }
 1462         return (error);
 1463 }
 1464 
 1465 int
 1466 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio, const struct thread *td)
 1467 {
 1468         if (vp->v_type != VREG || td == NULL)
 1469                 return (0);
 1470 
 1471         PROC_LOCK(td->td_proc);
 1472         if (uio->uio_offset + uio->uio_resid >
 1473             lim_cur(td->td_proc, RLIMIT_FSIZE)) {
 1474                 psignal(td->td_proc, SIGXFSZ);
 1475                 PROC_UNLOCK(td->td_proc);
 1476                 return (EFBIG);
 1477         }
 1478         PROC_UNLOCK(td->td_proc);
 1479 
 1480         return (0);
 1481 }
 1482 
 1483 void
 1484 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 1485 {
 1486         vm_object_t object;
 1487 
 1488         if ((object = vp->v_object) == NULL)
 1489                 return;
 1490         VM_OBJECT_LOCK(object);
 1491         vm_object_page_remove(object, start, end, 0);
 1492         VM_OBJECT_UNLOCK(object);
 1493 }
 1494 
 1495 int
 1496 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 1497 {
 1498         struct vattr va;
 1499         daddr_t bn, bnp;
 1500         uint64_t bsize;
 1501         off_t noff;
 1502         int error;
 1503 
 1504         KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 1505             ("Wrong command %lu", cmd));
 1506 
 1507         if (vn_lock(vp, LK_SHARED) != 0)
 1508                 return (EBADF);
 1509         if (vp->v_type != VREG) {
 1510                 error = ENOTTY;
 1511                 goto unlock;
 1512         }
 1513         error = VOP_GETATTR(vp, &va, cred);
 1514         if (error != 0)
 1515                 goto unlock;
 1516         noff = *off;
 1517         if (noff >= va.va_size) {
 1518                 error = ENXIO;
 1519                 goto unlock;
 1520         }
 1521         bsize = vp->v_mount->mnt_stat.f_iosize;
 1522         for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize) {
 1523                 error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 1524                 if (error == EOPNOTSUPP) {
 1525                         error = ENOTTY;
 1526                         goto unlock;
 1527                 }
 1528                 if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 1529                     (bnp != -1 && cmd == FIOSEEKDATA)) {
 1530                         noff = bn * bsize;
 1531                         if (noff < *off)
 1532                                 noff = *off;
 1533                         goto unlock;
 1534                 }
 1535         }
 1536         if (noff > va.va_size)
 1537                 noff = va.va_size;
 1538         /* noff == va.va_size. There is an implicit hole at the end of file. */
 1539         if (cmd == FIOSEEKDATA)
 1540                 error = ENXIO;
 1541 unlock:
 1542         VOP_UNLOCK(vp, 0);
 1543         if (error == 0)
 1544                 *off = noff;
 1545         return (error);
 1546 }
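
User space reaches this routine through the FIOSEEKHOLE/FIOSEEKDATA ioctls,
which back lseek(2)'s SEEK_HOLE/SEEK_DATA.  A minimal probe that walks a
file's data regions on systems supporting those whence values:

#include <sys/types.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	off_t data, hole = 0;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) == -1)
		return (1);
	for (;;) {
		data = lseek(fd, hole, SEEK_DATA);
		if (data == -1)
			break;		/* ENXIO: only a trailing hole left */
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole == -1)
			break;		/* unexpected: implicit hole at EOF */
		printf("data [%jd, %jd)\n", (intmax_t)data, (intmax_t)hole);
	}
	close(fd);
	return (0);
}
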
