vfs_vnops.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * Copyright (c) 1982, 1986, 1989, 1993
    3  *      The Regents of the University of California.  All rights reserved.
    4  * (c) UNIX System Laboratories, Inc.
    5  * All or some portions of this file are derived from material licensed
    6  * to the University of California by American Telephone and Telegraph
    7  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
    8  * the permission of UNIX System Laboratories, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  * 1. Redistributions of source code must retain the above copyright
   14  *    notice, this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright
   16  *    notice, this list of conditions and the following disclaimer in the
   17  *    documentation and/or other materials provided with the distribution.
   18  * 4. Neither the name of the University nor the names of its contributors
   19  *    may be used to endorse or promote products derived from this software
   20  *    without specific prior written permission.
   21  *
   22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  *
   34  *      @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
   35  */
   36 
   37 #include <sys/cdefs.h>
   38 __FBSDID("$FreeBSD: src/sys/kern/vfs_vnops.c,v 1.207.2.4 2005/05/01 01:26:25 csjp Exp $");
   39 
   40 #include "opt_mac.h"
   41 
   42 #include <sys/param.h>
   43 #include <sys/systm.h>
   44 #include <sys/fcntl.h>
   45 #include <sys/file.h>
   46 #include <sys/kdb.h>
   47 #include <sys/stat.h>
   48 #include <sys/proc.h>
   49 #include <sys/limits.h>
   50 #include <sys/lock.h>
   51 #include <sys/mac.h>
   52 #include <sys/mount.h>
   53 #include <sys/mutex.h>
   54 #include <sys/namei.h>
   55 #include <sys/vnode.h>
   56 #include <sys/bio.h>
   57 #include <sys/buf.h>
   58 #include <sys/filio.h>
   59 #include <sys/sx.h>
   60 #include <sys/ttycom.h>
   61 #include <sys/conf.h>
   62 #include <sys/syslog.h>
   63 #include <sys/unistd.h>
   64 
   65 static fo_rdwr_t        vn_read;
   66 static fo_rdwr_t        vn_write;
   67 static fo_ioctl_t       vn_ioctl;
   68 static fo_poll_t        vn_poll;
   69 static fo_kqfilter_t    vn_kqfilter;
   70 static fo_stat_t        vn_statfile;
   71 static fo_close_t       vn_closefile;
   72 
      /*
       * File operations table for vnode-backed files.  Each slot points at
       * one of the static vn_* handlers declared above; the table itself is
       * not static, so it is visible outside this file.  fo_flags marks
       * these files with DFLAG_PASSABLE and DFLAG_SEEKABLE.
       */
   73 struct  fileops vnops = {
   74         .fo_read = vn_read,
   75         .fo_write = vn_write,
   76         .fo_ioctl = vn_ioctl,
   77         .fo_poll = vn_poll,
   78         .fo_kqfilter = vn_kqfilter,
   79         .fo_stat = vn_statfile,
   80         .fo_close = vn_closefile,
   81         .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
   82 };
   83 
   84 int
   85 vn_open(ndp, flagp, cmode, fdidx)
   86         struct nameidata *ndp;
   87         int *flagp, cmode, fdidx;
   88 {
   89         struct thread *td = ndp->ni_cnd.cn_thread;
   90 
   91         return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
   92 }
   93 
   94 /*
   95  * Common code for vnode open operations.
   96  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
   97  * 
   98  * Note that this does NOT free nameidata for the successful case,
   99  * due to the NDINIT being done elsewhere.
       *
       * Arguments:
       *   ndp    nameidata for the lookup; cn_nameiop and cn_flags are
       *          overwritten here before namei() is called.
       *   flagp  in/out open flags.  Read at each (re)start and written
       *          back on every exit that happens after the vnode has been
       *          obtained, since O_TRUNC or O_CREAT may be cleared below.
       *   cmode  mode for a newly created file (stored in va_mode).
       *   cred   credentials used for the MAC, access, getattr and open
       *          calls.
       *   fdidx  passed through unchanged to VOP_OPEN.
       *
       * On success returns 0 with the vnode in ndp->ni_vp locked (see the
       * ASSERT_VOP_LOCKED before the final return).  Failures that go
       * through the "bad" label vput() the vnode and clear ndp->ni_vp.
  100  */
  101 int
  102 vn_open_cred(ndp, flagp, cmode, cred, fdidx)
  103         struct nameidata *ndp;
  104         int *flagp, cmode;
  105         struct ucred *cred;
  106         int fdidx;
  107 {
  108         struct vnode *vp;
  109         struct mount *mp;
  110         struct thread *td = ndp->ni_cnd.cn_thread;
  111         struct vattr vat;
  112         struct vattr *vap = &vat;
  113         int mode, fmode, error;
  114 #ifdef LOOKUP_SHARED
  115         int exclusive;  /* The current intended lock state */
  116 
  117         exclusive = 0;
  118 #endif
  119 
  120         GIANT_REQUIRED;
  121 
  122 restart:
  123         fmode = *flagp;
  124         if (fmode & O_CREAT) {
  125                 ndp->ni_cnd.cn_nameiop = CREATE;
  126                 ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
  127                 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
  128                         ndp->ni_cnd.cn_flags |= FOLLOW;
  129                 bwillwrite();
  130                 if ((error = namei(ndp)) != 0)
  131                         return (error);
  132                 if (ndp->ni_vp == NULL) {
                              /* The name does not exist yet: create a regular file. */
  133                         VATTR_NULL(vap);
  134                         vap->va_type = VREG;
  135                         vap->va_mode = cmode;
  136                         if (fmode & O_EXCL)
  137                                 vap->va_vaflags |= VA_EXCLUSIVE;
  138                         if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
                                      /*
                                       * The non-blocking attempt failed.  Drop
                                       * the path buffer and the parent directory,
                                       * call vn_start_write() again with
                                       * V_XSLEEP (presumably this waits until
                                       * writes are allowed -- confirm), then redo
                                       * the whole lookup from "restart".
                                       */
  139                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  140                                 vput(ndp->ni_dvp);
  141                                 if ((error = vn_start_write(NULL, &mp,
  142                                     V_XSLEEP | PCATCH)) != 0)
  143                                         return (error);
  144                                 goto restart;
  145                         }
  146 #ifdef MAC
  147                         error = mac_check_vnode_create(cred, ndp->ni_dvp,
  148                             &ndp->ni_cnd, vap);
  149                         if (error == 0) {
  150 #endif
  151                                 VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE);
  152                                 error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
  153                                                    &ndp->ni_cnd, vap);
  154 #ifdef MAC
  155                         }
  156 #endif
  157                         vput(ndp->ni_dvp);
  158                         vn_finished_write(mp);
  159                         if (error) {
  160                                 NDFREE(ndp, NDF_ONLY_PNBUF);
  161                                 return (error);
  162                         }
  163                         ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create");
  164                         ASSERT_VOP_LOCKED(ndp->ni_vp, "create");
                              /* O_TRUNC is dropped for a file that was just created. */
  165                         fmode &= ~O_TRUNC;
  166                         vp = ndp->ni_vp;
  167 #ifdef LOOKUP_SHARED
  168                         exclusive = 1;
  169 #endif
  170                 } else {
                              /*
                               * The name already exists.  Release the parent
                               * directory; when parent and leaf are the same
                               * vnode only the reference is dropped (vrele)
                               * instead of vput.
                               */
  171                         if (ndp->ni_dvp == ndp->ni_vp)
  172                                 vrele(ndp->ni_dvp);
  173                         else
  174                                 vput(ndp->ni_dvp);
  175                         ndp->ni_dvp = NULL;
  176                         vp = ndp->ni_vp;
  177                         if (fmode & O_EXCL) {
  178                                 error = EEXIST;
  179                                 goto bad;
  180                         }
                              /*
                               * Clearing O_CREAT makes the vn_writechk() and
                               * VOP_ACCESS() checks below run for this
                               * pre-existing file; they are skipped while
                               * O_CREAT is still set.
                               */
  181                         fmode &= ~O_CREAT;
  182                 }
  183         } else {
  184                 ndp->ni_cnd.cn_nameiop = LOOKUP;
  185 #ifdef LOOKUP_SHARED
  186                 ndp->ni_cnd.cn_flags =
  187                     ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
  188                     LOCKSHARED | LOCKLEAF;
  189 #else
  190                 ndp->ni_cnd.cn_flags =
  191                     ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF;
  192 #endif
  193                 if ((error = namei(ndp)) != 0)
  194                         return (error);
  195                 vp = ndp->ni_vp;
  196         }
              /* Symbolic links and sockets cannot be opened through this path. */
  197         if (vp->v_type == VLNK) {
  198                 error = EMLINK;
  199                 goto bad;
  200         }
  201         if (vp->v_type == VSOCK) {
  202                 error = EOPNOTSUPP;
  203                 goto bad;
  204         }
              /* Translate the open flags into the VREAD/VWRITE/VAPPEND access mode. */
  205         mode = 0;
  206         if (fmode & (FWRITE | O_TRUNC)) {
  207                 if (vp->v_type == VDIR) {
  208                         error = EISDIR;
  209                         goto bad;
  210                 }
  211                 mode |= VWRITE;
  212         }
  213         if (fmode & FREAD)
  214                 mode |= VREAD;
  215         if (fmode & O_APPEND)
  216                 mode |= VAPPEND;
  217 #ifdef MAC
  218         error = mac_check_vnode_open(cred, vp, mode);
  219         if (error)
  220                 goto bad;
  221 #endif
  222         if ((fmode & O_CREAT) == 0) {
  223                 if (mode & VWRITE) {
  224                         error = vn_writechk(vp);
  225                         if (error)
  226                                 goto bad;
  227                 }
  228                 if (mode) {
  229                         error = VOP_ACCESS(vp, mode, cred, td);
  230                         if (error)
  231                                 goto bad;
  232                 }
  233         }
              /*
               * Cache the filesystem and file ids when the attributes can be
               * read.  A VOP_GETATTR failure is not treated as fatal: the value
               * left in "error" is overwritten by the VOP_OPEN call that follows.
               */
  234         if ((error = VOP_GETATTR(vp, vap, cred, td)) == 0) {
  235                 vp->v_cachedfs = vap->va_fsid;
  236                 vp->v_cachedid = vap->va_fileid;
  237         }
  238         if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
  239                 goto bad;
  240         /*
  241          * Make sure that a VM object is created for VMIO support.
  242          */
  243         if (vn_canvmio(vp) == TRUE) {
  244 #ifdef LOOKUP_SHARED
  245                 int flock;
  246 
  247                 if (!exclusive && VOP_GETVOBJECT(vp, NULL) != 0)
  248                         VOP_LOCK(vp, LK_UPGRADE, td);
  249                 /*
  250                  * In cases where the object is marked as dead object_create
  251                  * will unlock and relock exclusive.  It is safe to call in
  252                  * here with a shared lock because we only examine fields that
  253                  * the shared lock guarantees will be stable.  In the UPGRADE
  254                  * case it is not likely that anyone has used this vnode yet
  255                  * so there will be no contention.  The logic after this call
  256                  * restores the requested locking state.
  257                  */
  258 #endif
  259                 if ((error = vfs_object_create(vp, td, cred)) != 0) {
                              /*
                               * The vnode has already been opened, so it is
                               * unlocked, closed and released by hand here
                               * rather than via "bad".
                               * NOTE(review): unlike "bad", this exit does not
                               * clear ndp->ni_vp -- confirm that callers never
                               * look at it after an error return.
                               */
  260                         VOP_UNLOCK(vp, 0, td);
  261                         VOP_CLOSE(vp, fmode, cred, td);
  262                         NDFREE(ndp, NDF_ONLY_PNBUF);
  263                         vrele(vp);
  264                         *flagp = fmode;
  265                         return (error);
  266                 }
  267 #ifdef LOOKUP_SHARED
  268                 flock = VOP_ISLOCKED(vp, td);
  269                 if (!exclusive && flock == LK_EXCLUSIVE)
  270                         VOP_LOCK(vp, LK_DOWNGRADE, td);
  271 #endif
  272         }
  273 
              /* Count this open as a writer; vn_close() decrements it again. */
  274         if (fmode & FWRITE)
  275                 vp->v_writecount++;
  276         *flagp = fmode;
  277         ASSERT_VOP_LOCKED(vp, "vn_open_cred");
  278         return (0);
              /*
               * Common error exit: free the path buffer, unlock and release the
               * vnode, report the (possibly modified) flags and clear ni_vp.
               */
  279 bad:
  280         NDFREE(ndp, NDF_ONLY_PNBUF);
  281         vput(vp);
  282         *flagp = fmode;
  283         ndp->ni_vp = NULL;
  284         return (error);
  285 }
  286 
  287 /*
  288  * Check for write permissions on the specified vnode.
  289  * Prototype text segments cannot be written.
  290  */
  291 int
  292 vn_writechk(vp)
  293         register struct vnode *vp;
  294 {
  295 
  296         ASSERT_VOP_LOCKED(vp, "vn_writechk");
  297         /*
  298          * If there's shared text associated with
  299          * the vnode, try to free it up once.  If
  300          * we fail, we can't allow writing.
  301          */
  302         if (vp->v_vflag & VV_TEXT)
  303                 return (ETXTBSY);
  304 
  305         return (0);
  306 }
  307 
  308 /*
  309  * Vnode close call
  310  */
  311 int
  312 vn_close(vp, flags, file_cred, td)
  313         register struct vnode *vp;
  314         int flags;
  315         struct ucred *file_cred;
  316         struct thread *td;
  317 {
  318         int error;
  319 
  320         GIANT_REQUIRED;
  321 
  322         if (flags & FWRITE)
  323                 vp->v_writecount--;
  324         error = VOP_CLOSE(vp, flags, file_cred, td);
  325         /*
  326          * XXX - In certain instances VOP_CLOSE has to do the vrele
  327          * itself. If the vrele has been done, it will return EAGAIN
  328          * to indicate that the vrele should not be done again. When
  329          * this happens, we just return success. The correct thing to
  330          * do would be to have all VOP_CLOSE instances do the vrele.
  331          */
  332         if (error == EAGAIN)
  333                 return (0);
  334         vrele(vp);
  335         return (error);
  336 }
  337 
  338 /*
  339  * Sequential heuristic - detect sequential operation
  340  */
  341 static __inline
  342 int
  343 sequential_heuristic(struct uio *uio, struct file *fp)
  344 {
  345 
  346         if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
  347             uio->uio_offset == fp->f_nextoff) {
  348                 /*
  349                  * XXX we assume that the filesystem block size is
  350                  * the default.  Not true, but still gives us a pretty
  351                  * good indicator of how sequential the read operations
  352                  * are.
  353                  */
  354                 fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
  355                 if (fp->f_seqcount > IO_SEQMAX)
  356                         fp->f_seqcount = IO_SEQMAX;
  357                 return(fp->f_seqcount << IO_SEQSHIFT);
  358         }
  359 
  360         /*
  361          * Not sequential, quick draw-down of seqcount
  362          */
  363         if (fp->f_seqcount > 1)
  364                 fp->f_seqcount = 1;
  365         else
  366                 fp->f_seqcount = 0;
  367         return(0);
  368 }
  369 
  370 /*
  371  * Package up an I/O request on a vnode into a uio and do it.
  372  */
  373 int
  374 vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
  375     aresid, td)
  376         enum uio_rw rw;
  377         struct vnode *vp;
  378         caddr_t base;
  379         int len;
  380         off_t offset;
  381         enum uio_seg segflg;
  382         int ioflg;
  383         struct ucred *active_cred;
  384         struct ucred *file_cred;
  385         int *aresid;
  386         struct thread *td;
  387 {
  388         struct uio auio;
  389         struct iovec aiov;
  390         struct mount *mp;
  391         struct ucred *cred;
  392         int error;
  393 
  394         GIANT_REQUIRED;
  395 
  396         if ((ioflg & IO_NODELOCKED) == 0) {
  397                 mp = NULL;
  398                 if (rw == UIO_WRITE) { 
  399                         if (vp->v_type != VCHR &&
  400                             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
  401                             != 0)
  402                                 return (error);
  403                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  404                 } else {
  405                         /*
  406                          * XXX This should be LK_SHARED but I don't trust VFS
  407                          * enough to leave it like that until it has been
  408                          * reviewed further.
  409                          */
  410                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  411                 }
  412 
  413         }
  414         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
  415         auio.uio_iov = &aiov;
  416         auio.uio_iovcnt = 1;
  417         aiov.iov_base = base;
  418         aiov.iov_len = len;
  419         auio.uio_resid = len;
  420         auio.uio_offset = offset;
  421         auio.uio_segflg = segflg;
  422         auio.uio_rw = rw;
  423         auio.uio_td = td;
  424         error = 0;
  425 #ifdef MAC
  426         if ((ioflg & IO_NOMACCHECK) == 0) {
  427                 if (rw == UIO_READ)
  428                         error = mac_check_vnode_read(active_cred, file_cred,
  429                             vp);
  430                 else
  431                         error = mac_check_vnode_write(active_cred, file_cred,
  432                             vp);
  433         }
  434 #endif
  435         if (error == 0) {
  436                 if (file_cred)
  437                         cred = file_cred;
  438                 else
  439                         cred = active_cred;
  440                 if (rw == UIO_READ)
  441                         error = VOP_READ(vp, &auio, ioflg, cred);
  442                 else
  443                         error = VOP_WRITE(vp, &auio, ioflg, cred);
  444         }
  445         if (aresid)
  446                 *aresid = auio.uio_resid;
  447         else
  448                 if (auio.uio_resid && error == 0)
  449                         error = EIO;
  450         if ((ioflg & IO_NODELOCKED) == 0) {
  451                 if (rw == UIO_WRITE)
  452                         vn_finished_write(mp);
  453                 VOP_UNLOCK(vp, 0, td);
  454         }
  455         return (error);
  456 }
  457 
  458 /*
  459  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  460  * request is split up into smaller chunks and we try to avoid saturating
  461  * the buffer cache while potentially holding a vnode locked, so we 
  462  * check bwillwrite() before calling vn_rdwr().  We also call uio_yield()
  463  * to give other processes a chance to lock the vnode (either other processes
  464  * core'ing the same binary, or unrelated processes scanning the directory).
  465  */
  466 int
  467 vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
  468     file_cred, aresid, td)
  469         enum uio_rw rw;
  470         struct vnode *vp;
  471         caddr_t base;
  472         size_t len;
  473         off_t offset;
  474         enum uio_seg segflg;
  475         int ioflg;
  476         struct ucred *active_cred;
  477         struct ucred *file_cred;
  478         size_t *aresid;
  479         struct thread *td;
  480 {
  481         int error = 0;
  482         int iaresid;
  483 
  484         GIANT_REQUIRED;
  485 
  486         do {
  487                 int chunk;
  488 
  489                 /*
  490                  * Force `offset' to a multiple of MAXBSIZE except possibly
  491                  * for the first chunk, so that filesystems only need to
  492                  * write full blocks except possibly for the first and last
  493                  * chunks.
  494                  */
  495                 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
  496 
  497                 if (chunk > len)
  498                         chunk = len;
  499                 if (rw != UIO_READ && vp->v_type == VREG)
  500                         bwillwrite();
  501                 iaresid = 0;
  502                 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
  503                     ioflg, active_cred, file_cred, &iaresid, td);
  504                 len -= chunk;   /* aresid calc already includes length */
  505                 if (error)
  506                         break;
  507                 offset += chunk;
  508                 base += chunk;
  509                 uio_yield();
  510         } while (len);
  511         if (aresid)
  512                 *aresid = len + iaresid;
  513         return (error);
  514 }
  515 
  516 /*
  517  * File table vnode read routine.
  518  */
  519 static int
  520 vn_read(fp, uio, active_cred, flags, td)
  521         struct file *fp;
  522         struct uio *uio;
  523         struct ucred *active_cred;
  524         struct thread *td;
  525         int flags;
  526 {
  527         struct vnode *vp;
  528         int error, ioflag;
  529 
  530         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
  531             uio->uio_td, td));
  532         vp = fp->f_vnode;
  533         ioflag = 0;
  534         if (fp->f_flag & FNONBLOCK)
  535                 ioflag |= IO_NDELAY;
  536         if (fp->f_flag & O_DIRECT)
  537                 ioflag |= IO_DIRECT;
  538         mtx_lock(&Giant);
  539         VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
  540         /*
  541          * According to McKusick the vn lock is protecting f_offset here.
  542          * Once this field has it's own lock we can acquire this shared.
  543          */
  544         if ((flags & FOF_OFFSET) == 0) {
  545                 vn_lock(vp, LK_EXCLUSIVE | LK_NOPAUSE | LK_RETRY, td);
  546                 uio->uio_offset = fp->f_offset;
  547         } else
  548                 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
  549 
  550         ioflag |= sequential_heuristic(uio, fp);
  551 
  552 #ifdef MAC
  553         error = mac_check_vnode_read(active_cred, fp->f_cred, vp);
  554         if (error == 0)
  555 #endif
  556                 error = VOP_READ(vp, uio, ioflag, fp->f_cred);
  557         if ((flags & FOF_OFFSET) == 0)
  558                 fp->f_offset = uio->uio_offset;
  559         fp->f_nextoff = uio->uio_offset;
  560         VOP_UNLOCK(vp, 0, td);
  561         mtx_unlock(&Giant);
  562         return (error);
  563 }
  564 
  565 /*
  566  * File table vnode write routine.
  567  */
  568 static int
  569 vn_write(fp, uio, active_cred, flags, td)
  570         struct file *fp;
  571         struct uio *uio;
  572         struct ucred *active_cred;
  573         struct thread *td;
  574         int flags;
  575 {
  576         struct vnode *vp;
  577         struct mount *mp;
  578         int error, ioflag;
  579 
  580         KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
  581             uio->uio_td, td));
  582         vp = fp->f_vnode;
  583         mtx_lock(&Giant);
  584         if (vp->v_type == VREG)
  585                 bwillwrite();
  586         ioflag = IO_UNIT;
  587         if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
  588                 ioflag |= IO_APPEND;
  589         if (fp->f_flag & FNONBLOCK)
  590                 ioflag |= IO_NDELAY;
  591         if (fp->f_flag & O_DIRECT)
  592                 ioflag |= IO_DIRECT;
  593         if ((fp->f_flag & O_FSYNC) ||
  594             (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
  595                 ioflag |= IO_SYNC;
  596         mp = NULL;
  597         if (vp->v_type != VCHR &&
  598             (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
  599                 mtx_unlock(&Giant);
  600                 return (error);
  601         }
  602         VOP_LEASE(vp, td, fp->f_cred, LEASE_WRITE);
  603         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  604         if ((flags & FOF_OFFSET) == 0)
  605                 uio->uio_offset = fp->f_offset;
  606         ioflag |= sequential_heuristic(uio, fp);
  607 #ifdef MAC
  608         error = mac_check_vnode_write(active_cred, fp->f_cred, vp);
  609         if (error == 0)
  610 #endif
  611                 error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
  612         if ((flags & FOF_OFFSET) == 0)
  613                 fp->f_offset = uio->uio_offset;
  614         fp->f_nextoff = uio->uio_offset;
  615         VOP_UNLOCK(vp, 0, td);
  616         vn_finished_write(mp);
  617         mtx_unlock(&Giant);
  618         return (error);
  619 }
  620 
  621 /*
  622  * File table vnode stat routine.
  623  */
  624 static int
  625 vn_statfile(fp, sb, active_cred, td)
  626         struct file *fp;
  627         struct stat *sb;
  628         struct ucred *active_cred;
  629         struct thread *td;
  630 {
  631         struct vnode *vp = fp->f_vnode;
  632         int error;
  633 
  634         mtx_lock(&Giant);
  635         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  636         error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
  637         VOP_UNLOCK(vp, 0, td);
  638         mtx_unlock(&Giant);
  639 
  640         return (error);
  641 }
  642 
  643 /*
  644  * Stat a vnode; implementation for the stat syscall
  645  */
  646 int
  647 vn_stat(vp, sb, active_cred, file_cred, td)
  648         struct vnode *vp;
  649         register struct stat *sb;
  650         struct ucred *active_cred;
  651         struct ucred *file_cred;
  652         struct thread *td;
  653 {
  654         struct vattr vattr;
  655         register struct vattr *vap;
  656         int error;
  657         u_short mode;
  658 
  659         GIANT_REQUIRED;
  660 
  661 #ifdef MAC
  662         error = mac_check_vnode_stat(active_cred, file_cred, vp);
  663         if (error)
  664                 return (error);
  665 #endif
  666 
  667         vap = &vattr;
  668         error = VOP_GETATTR(vp, vap, active_cred, td);
  669         if (error)
  670                 return (error);
  671 
  672         vp->v_cachedfs = vap->va_fsid;
  673         vp->v_cachedid = vap->va_fileid;
  674 
  675         /*
  676          * Zero the spare stat fields
  677          */
  678         bzero(sb, sizeof *sb);
  679 
  680         /*
  681          * Copy from vattr table
  682          */
  683         if (vap->va_fsid != VNOVAL)
  684                 sb->st_dev = vap->va_fsid;
  685         else
  686                 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
  687         sb->st_ino = vap->va_fileid;
  688         mode = vap->va_mode;
  689         switch (vap->va_type) {
  690         case VREG:
  691                 mode |= S_IFREG;
  692                 break;
  693         case VDIR:
  694                 mode |= S_IFDIR;
  695                 break;
  696         case VBLK:
  697                 mode |= S_IFBLK;
  698                 break;
  699         case VCHR:
  700                 mode |= S_IFCHR;
  701                 break;
  702         case VLNK:
  703                 mode |= S_IFLNK;
  704                 /* This is a cosmetic change, symlinks do not have a mode. */
  705                 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
  706                         sb->st_mode &= ~ACCESSPERMS;    /* 0000 */
  707                 else
  708                         sb->st_mode |= ACCESSPERMS;     /* 0777 */
  709                 break;
  710         case VSOCK:
  711                 mode |= S_IFSOCK;
  712                 break;
  713         case VFIFO:
  714                 mode |= S_IFIFO;
  715                 break;
  716         default:
  717                 return (EBADF);
  718         };
  719         sb->st_mode = mode;
  720         sb->st_nlink = vap->va_nlink;
  721         sb->st_uid = vap->va_uid;
  722         sb->st_gid = vap->va_gid;
  723         sb->st_rdev = vap->va_rdev;
  724         if (vap->va_size > OFF_MAX)
  725                 return (EOVERFLOW);
  726         sb->st_size = vap->va_size;
  727         sb->st_atimespec = vap->va_atime;
  728         sb->st_mtimespec = vap->va_mtime;
  729         sb->st_ctimespec = vap->va_ctime;
  730         sb->st_birthtimespec = vap->va_birthtime;
  731 
  732         /*
  733          * According to www.opengroup.org, the meaning of st_blksize is 
  734          *   "a filesystem-specific preferred I/O block size for this 
  735          *    object.  In some filesystem types, this may vary from file
  736          *    to file"
  737          * Default to PAGE_SIZE after much discussion.
  738          */
  739 
  740         if (vap->va_type == VREG) {
  741                 sb->st_blksize = vap->va_blocksize;
  742         } else if (vn_isdisk(vp, NULL)) {
  743                 sb->st_blksize = vp->v_rdev->si_bsize_best;
  744                 if (sb->st_blksize < vp->v_rdev->si_bsize_phys)
  745                         sb->st_blksize = vp->v_rdev->si_bsize_phys;
  746                 if (sb->st_blksize < BLKDEV_IOSIZE)
  747                         sb->st_blksize = BLKDEV_IOSIZE;
  748         } else {
  749                 sb->st_blksize = PAGE_SIZE;
  750         }
  751         
  752         sb->st_flags = vap->va_flags;
  753         if (suser(td))
  754                 sb->st_gen = 0;
  755         else
  756                 sb->st_gen = vap->va_gen;
  757 
  758 #if (S_BLKSIZE == 512)
  759         /* Optimize this case */
  760         sb->st_blocks = vap->va_bytes >> 9;
  761 #else
  762         sb->st_blocks = vap->va_bytes / S_BLKSIZE;
  763 #endif
  764         return (0);
  765 }
  766 
  767 /*
  768  * File table vnode ioctl routine.
  769  */
  770 static int
  771 vn_ioctl(fp, com, data, active_cred, td)
  772         struct file *fp;
  773         u_long com;
  774         void *data;
  775         struct ucred *active_cred;
  776         struct thread *td;
  777 {
  778         struct vnode *vp = fp->f_vnode;
  779         struct vattr vattr;
  780         int error;
  781 
  782         mtx_lock(&Giant);
  783         error = ENOTTY;
  784         switch (vp->v_type) {
  785         case VREG:
  786         case VDIR:
  787                 if (com == FIONREAD) {
  788                         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  789                         error = VOP_GETATTR(vp, &vattr, active_cred, td);
  790                         VOP_UNLOCK(vp, 0, td);
  791                         if (!error)
  792                                 *(int *)data = vattr.va_size - fp->f_offset;
  793                 }
  794                 if (com == FIONBIO || com == FIOASYNC)  /* XXX */
  795                         error = 0;
  796                 else
  797                         error = VOP_IOCTL(vp, com, data, fp->f_flag,
  798                             active_cred, td);
  799                 break;
  800 
  801         default:
  802 #if 0
  803                 break;
  804 #endif
  805         case VFIFO:
  806         case VCHR:
  807         case VBLK:
  808                 if (com == FIODTYPE) {
  809                         if (vp->v_type != VCHR && vp->v_type != VBLK)
  810                                 break;
  811                         *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK;
  812                         error = 0;
  813                         break;
  814                 }
  815                 error = VOP_IOCTL(vp, com, data, fp->f_flag, active_cred, td);
  816                 if (error == ENOIOCTL) {
  817 #ifdef DIAGNOSTIC
  818                         kdb_enter("ENOIOCTL leaked through");
  819 #endif
  820                         error = ENOTTY;
  821                 }
  822                 if (error == 0 && com == TIOCSCTTY) {
  823                         struct vnode *vpold;
  824 
  825                         /* Do nothing if reassigning same control tty */
  826                         sx_slock(&proctree_lock);
  827                         if (td->td_proc->p_session->s_ttyvp == vp) {
  828                                 sx_sunlock(&proctree_lock);
  829                                 error = 0;
  830                                 break;
  831                         }
  832 
  833                         vpold = td->td_proc->p_session->s_ttyvp;
  834                         VREF(vp);
  835                         SESS_LOCK(td->td_proc->p_session);
  836                         td->td_proc->p_session->s_ttyvp = vp;
  837                         SESS_UNLOCK(td->td_proc->p_session);
  838 
  839                         sx_sunlock(&proctree_lock);
  840 
  841                         /* Get rid of reference to old control tty */
  842                         if (vpold)
  843                                 vrele(vpold);
  844                 }
  845                 break;
  846         }
  847         mtx_unlock(&Giant);
  848         return (error);
  849 }
  850 
  851 /*
  852  * File table vnode poll routine.
  853  */
  854 static int
  855 vn_poll(fp, events, active_cred, td)
  856         struct file *fp;
  857         int events;
  858         struct ucred *active_cred;
  859         struct thread *td;
  860 {
  861         struct vnode *vp;
  862         int error;
  863 
  864         mtx_lock(&Giant);
  865 
  866         vp = fp->f_vnode;
  867 #ifdef MAC
  868         vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
  869         error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
  870         VOP_UNLOCK(vp, 0, td);
  871         if (!error)
  872 #endif
  873 
  874         error = VOP_POLL(vp, events, fp->f_cred, td);
  875         mtx_unlock(&Giant);
  876         return (error);
  877 }
  878 
  879 /*
  880  * Check that the vnode is still valid, and if so
  881  * acquire requested lock.
  882  */
  883 int
  884 #ifndef DEBUG_LOCKS
  885 vn_lock(vp, flags, td)
  886 #else
  887 debug_vn_lock(vp, flags, td, filename, line)
  888 #endif
  889         struct vnode *vp;
  890         int flags;
  891         struct thread *td;
  892 #ifdef  DEBUG_LOCKS
  893         const char *filename;
  894         int line;
  895 #endif
  896 {
  897         int error;
  898 
  899         do {
  900                 if ((flags & LK_INTERLOCK) == 0)
  901                         VI_LOCK(vp);
  902                 if ((vp->v_iflag & VI_XLOCK) && vp->v_vxthread != curthread) {
  903                         if ((flags & LK_NOWAIT) != 0) {
  904                                 VI_UNLOCK(vp);
  905                                 return (ENOENT);
  906                         }
  907                         vp->v_iflag |= VI_XWANT;
  908                         msleep(vp, VI_MTX(vp), PINOD, "vn_lock", 0);
  909                         if ((flags & LK_RETRY) == 0) {
  910                                 VI_UNLOCK(vp);
  911                                 return (ENOENT);
  912                         }
  913                 } 
  914 #ifdef  DEBUG_LOCKS
  915                 vp->filename = filename;
  916                 vp->line = line;
  917 #endif
  918                 /*
  919                  * lockmgr drops interlock before it will return for
  920                  * any reason.  So force the code above to relock it.
  921                  */
  922                 error = VOP_LOCK(vp, flags | LK_NOPAUSE | LK_INTERLOCK, td);
  923                 flags &= ~LK_INTERLOCK;
  924         } while (flags & LK_RETRY && error != 0);
  925         return (error);
  926 }
  927 
  928 /*
  929  * File table vnode close routine.
  930  */
  931 static int
  932 vn_closefile(fp, td)
  933         struct file *fp;
  934         struct thread *td;
  935 {
  936         struct vnode *vp;
  937         struct flock lf;
  938         int error;
  939 
  940         vp = fp->f_vnode;
  941 
  942         mtx_lock(&Giant);
  943         if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
  944                 lf.l_whence = SEEK_SET;
  945                 lf.l_start = 0;
  946                 lf.l_len = 0;
  947                 lf.l_type = F_UNLCK;
  948                 (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
  949         }
  950 
  951         fp->f_ops = &badfileops;
  952 
  953         error = vn_close(vp, fp->f_flag, fp->f_cred, td);
  954         mtx_unlock(&Giant);
  955         return (error);
  956 }
  957 
  958 /*
  959  * Preparing to start a filesystem write operation. If the operation is
  960  * permitted, then we bump the count of operations in progress and
  961  * proceed. If a suspend request is in progress, we wait until the
  962  * suspension is over, and then proceed.
  963  */
  964 int
  965 vn_start_write(vp, mpp, flags)
  966         struct vnode *vp;
  967         struct mount **mpp;
  968         int flags;
  969 {
  970         struct mount *mp;
  971         int error;
  972 
  973         GIANT_REQUIRED;
  974 
  975         /*
  976          * If a vnode is provided, get and return the mount point that
  977          * to which it will write.
  978          */
  979         if (vp != NULL) {
  980                 if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
  981                         *mpp = NULL;
  982                         if (error != EOPNOTSUPP)
  983                                 return (error);
  984                         return (0);
  985                 }
  986         }
  987         if ((mp = *mpp) == NULL)
  988                 return (0);
  989         /*
  990          * Check on status of suspension.
  991          */
  992         while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
  993                 if (flags & V_NOWAIT)
  994                         return (EWOULDBLOCK);
  995                 error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
  996                     "suspfs", 0);
  997                 if (error)
  998                         return (error);
  999         }
 1000         if (flags & V_XSLEEP)
 1001                 return (0);
 1002         mp->mnt_writeopcount++;
 1003         return (0);
 1004 }
 1005 
 1006 /*
 1007  * Secondary suspension. Used by operations such as vop_inactive
 1008  * routines that are needed by the higher level functions. These
 1009  * are allowed to proceed until all the higher level functions have
 1010  * completed (indicated by mnt_writeopcount dropping to zero). At that
 1011  * time, these operations are halted until the suspension is over.
 1012  */
 1013 int
 1014 vn_write_suspend_wait(vp, mp, flags)
 1015         struct vnode *vp;
 1016         struct mount *mp;
 1017         int flags;
 1018 {
 1019         int error;
 1020 
 1021         GIANT_REQUIRED;
 1022 
 1023         if (vp != NULL) {
 1024                 if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
 1025                         if (error != EOPNOTSUPP)
 1026                                 return (error);
 1027                         return (0);
 1028                 }
 1029         }
 1030         /*
 1031          * If we are not suspended or have not yet reached suspended
 1032          * mode, then let the operation proceed.
 1033          */
 1034         if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
 1035                 return (0);
 1036         if (flags & V_NOWAIT)
 1037                 return (EWOULDBLOCK);
 1038         /*
 1039          * Wait for the suspension to finish.
 1040          */
 1041         return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
 1042             "suspfs", 0));
 1043 }
 1044 
 1045 /*
 1046  * Filesystem write operation has completed. If we are suspending and this
 1047  * operation is the last one, notify the suspender that the suspension is
 1048  * now in effect.
 1049  */
 1050 void
 1051 vn_finished_write(mp)
 1052         struct mount *mp;
 1053 {
 1054 
 1055         GIANT_REQUIRED;
 1056 
 1057         if (mp == NULL)
 1058                 return;
 1059         mp->mnt_writeopcount--;
 1060         if (mp->mnt_writeopcount < 0)
 1061                 panic("vn_finished_write: neg cnt");
 1062         if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 1063             mp->mnt_writeopcount <= 0)
 1064                 wakeup(&mp->mnt_writeopcount);
 1065 }
 1066 
 1067 /*
 1068  * Request a filesystem to suspend write operations.
 1069  */
 1070 int
 1071 vfs_write_suspend(mp)
 1072         struct mount *mp;
 1073 {
 1074         struct thread *td = curthread;
 1075         int error;
 1076 
 1077         GIANT_REQUIRED;
 1078 
 1079         if (mp->mnt_kern_flag & MNTK_SUSPEND)
 1080                 return (0);
 1081         mp->mnt_kern_flag |= MNTK_SUSPEND;
 1082         if (mp->mnt_writeopcount > 0)
 1083                 (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0);
 1084         if ((error = VFS_SYNC(mp, MNT_WAIT, td->td_ucred, td)) != 0) {
 1085                 vfs_write_resume(mp);
 1086                 return (error);
 1087         }
 1088         mp->mnt_kern_flag |= MNTK_SUSPENDED;
 1089         return (0);
 1090 }
 1091 
 1092 /*
 1093  * Request a filesystem to resume write operations.
 1094  */
 1095 void
 1096 vfs_write_resume(mp)
 1097         struct mount *mp;
 1098 {
 1099 
 1100         GIANT_REQUIRED;
 1101 
 1102         if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0)
 1103                 return;
 1104         mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED);
 1105         wakeup(&mp->mnt_writeopcount);
 1106         wakeup(&mp->mnt_flag);
 1107 }
 1108 
 1109 /*
 1110  * Implement kqueues for files by translating it to vnode operation.
 1111  */
 1112 static int
 1113 vn_kqfilter(struct file *fp, struct knote *kn)
 1114 {
 1115         int error;
 1116 
 1117         mtx_lock(&Giant);
 1118         error = VOP_KQFILTER(fp->f_vnode, kn);
 1119         mtx_unlock(&Giant);
 1120 
 1121         return error;
 1122 }
 1123 
 1124 /*
 1125  * Simplified in-kernel wrapper calls for extended attribute access.
 1126  * Both calls pass in a NULL credential, authorizing as "kernel" access.
 1127  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
 1128  */
 1129 int
 1130 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 1131     const char *attrname, int *buflen, char *buf, struct thread *td)
 1132 {
 1133         struct uio      auio;
 1134         struct iovec    iov;
 1135         int     error;
 1136 
 1137         iov.iov_len = *buflen;
 1138         iov.iov_base = buf;
 1139 
 1140         auio.uio_iov = &iov;
 1141         auio.uio_iovcnt = 1;
 1142         auio.uio_rw = UIO_READ;
 1143         auio.uio_segflg = UIO_SYSSPACE;
 1144         auio.uio_td = td;
 1145         auio.uio_offset = 0;
 1146         auio.uio_resid = *buflen;
 1147 
 1148         if ((ioflg & IO_NODELOCKED) == 0)
 1149                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 1150 
 1151         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1152 
 1153         /* authorize attribute retrieval as kernel */
 1154         error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 1155             td);
 1156 
 1157         if ((ioflg & IO_NODELOCKED) == 0)
 1158                 VOP_UNLOCK(vp, 0, td);
 1159 
 1160         if (error == 0) {
 1161                 *buflen = *buflen - auio.uio_resid;
 1162         }
 1163 
 1164         return (error);
 1165 }
 1166 
 1167 /*
 1168  * XXX failure mode if partially written?
 1169  */
 1170 int
 1171 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 1172     const char *attrname, int buflen, char *buf, struct thread *td)
 1173 {
 1174         struct uio      auio;
 1175         struct iovec    iov;
 1176         struct mount    *mp;
 1177         int     error;
 1178 
 1179         iov.iov_len = buflen;
 1180         iov.iov_base = buf;
 1181 
 1182         auio.uio_iov = &iov;
 1183         auio.uio_iovcnt = 1;
 1184         auio.uio_rw = UIO_WRITE;
 1185         auio.uio_segflg = UIO_SYSSPACE;
 1186         auio.uio_td = td;
 1187         auio.uio_offset = 0;
 1188         auio.uio_resid = buflen;
 1189 
 1190         if ((ioflg & IO_NODELOCKED) == 0) {
 1191                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 1192                         return (error);
 1193                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 1194         }
 1195 
 1196         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1197 
 1198         /* authorize attribute setting as kernel */
 1199         error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 1200 
 1201         if ((ioflg & IO_NODELOCKED) == 0) {
 1202                 vn_finished_write(mp);
 1203                 VOP_UNLOCK(vp, 0, td);
 1204         }
 1205 
 1206         return (error);
 1207 }
 1208 
 1209 int
 1210 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 1211     const char *attrname, struct thread *td)
 1212 {
 1213         struct mount    *mp;
 1214         int     error;
 1215 
 1216         if ((ioflg & IO_NODELOCKED) == 0) {
 1217                 if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 1218                         return (error);
 1219                 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 1220         }
 1221 
 1222         ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 1223 
 1224         /* authorize attribute removal as kernel */
 1225         error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 1226         if (error == EOPNOTSUPP)
 1227                 error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 1228                     NULL, td);
 1229 
 1230         if ((ioflg & IO_NODELOCKED) == 0) {
 1231                 vn_finished_write(mp);
 1232                 VOP_UNLOCK(vp, 0, td);
 1233         }
 1234 
 1235         return (error);
 1236 }
Cache object: 6d38283f44dff3ba5e7271bf70e66dfd
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/kern/vfs_vnops.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/vfs_vnops.c