FreeBSD/Linux Kernel Cross Reference
sys/ufs/ffs/ffs_vnops.c

/*
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95
 * $FreeBSD: releng/5.0/sys/ufs/ffs/ffs_vnops.c 105422 2002-10-18 22:52:41Z dillon $
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

static int      ffs_fsync(struct vop_fsync_args *);
static int      ffs_getpages(struct vop_getpages_args *);
static int      ffs_read(struct vop_read_args *);
static int      ffs_write(struct vop_write_args *);
static int      ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int      ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
                    struct ucred *cred);
static int      ffsext_strategy(struct vop_strategy_args *);
static int      ffs_closeextattr(struct vop_closeextattr_args *);
static int      ffs_getextattr(struct vop_getextattr_args *);
static int      ffs_openextattr(struct vop_openextattr_args *);
static int      ffs_setextattr(struct vop_setextattr_args *);


/* Global vfs data structures for ufs. */
vop_t **ffs_vnodeop_p;
static struct vnodeopv_entry_desc ffs_vnodeop_entries[] = {
        { &vop_default_desc,            (vop_t *) ufs_vnoperate },
        { &vop_fsync_desc,              (vop_t *) ffs_fsync },
        { &vop_getpages_desc,           (vop_t *) ffs_getpages },
        { &vop_read_desc,               (vop_t *) ffs_read },
        { &vop_reallocblks_desc,        (vop_t *) ffs_reallocblks },
        { &vop_write_desc,              (vop_t *) ffs_write },
        { &vop_closeextattr_desc,       (vop_t *) ffs_closeextattr },
        { &vop_getextattr_desc,         (vop_t *) ffs_getextattr },
        { &vop_openextattr_desc,        (vop_t *) ffs_openextattr },
        { &vop_setextattr_desc,         (vop_t *) ffs_setextattr },
        { NULL, NULL }
};
static struct vnodeopv_desc ffs_vnodeop_opv_desc =
        { &ffs_vnodeop_p, ffs_vnodeop_entries };

vop_t **ffs_specop_p;
static struct vnodeopv_entry_desc ffs_specop_entries[] = {
        { &vop_default_desc,            (vop_t *) ufs_vnoperatespec },
        { &vop_fsync_desc,              (vop_t *) ffs_fsync },
        { &vop_reallocblks_desc,        (vop_t *) ffs_reallocblks },
        { &vop_strategy_desc,           (vop_t *) ffsext_strategy },
        { &vop_closeextattr_desc,       (vop_t *) ffs_closeextattr },
        { &vop_getextattr_desc,         (vop_t *) ffs_getextattr },
        { &vop_openextattr_desc,        (vop_t *) ffs_openextattr },
        { &vop_setextattr_desc,         (vop_t *) ffs_setextattr },
        { NULL, NULL }
};
static struct vnodeopv_desc ffs_specop_opv_desc =
        { &ffs_specop_p, ffs_specop_entries };

vop_t **ffs_fifoop_p;
static struct vnodeopv_entry_desc ffs_fifoop_entries[] = {
        { &vop_default_desc,            (vop_t *) ufs_vnoperatefifo },
        { &vop_fsync_desc,              (vop_t *) ffs_fsync },
        { &vop_reallocblks_desc,        (vop_t *) ffs_reallocblks },
        { &vop_strategy_desc,           (vop_t *) ffsext_strategy },
        { &vop_closeextattr_desc,       (vop_t *) ffs_closeextattr },
        { &vop_getextattr_desc,         (vop_t *) ffs_getextattr },
        { &vop_openextattr_desc,        (vop_t *) ffs_openextattr },
        { &vop_setextattr_desc,         (vop_t *) ffs_setextattr },
        { NULL, NULL }
};
static struct vnodeopv_desc ffs_fifoop_opv_desc =
        { &ffs_fifoop_p, ffs_fifoop_entries };

VNODEOP_SET(ffs_vnodeop_opv_desc);
VNODEOP_SET(ffs_specop_opv_desc);
VNODEOP_SET(ffs_fifoop_opv_desc);
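
/*
 * Editor's note: VNODEOP_SET() registers each descriptor table with the
 * VFS through a SYSINIT-driven hook, so the operation vectors
 * ffs_vnodeop_p, ffs_specop_p and ffs_fifoop_p are filled in at boot or
 * module load time; any operation without an explicit entry falls
 * through to the vop_default_desc handler named first in each table.
 */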

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(ap)
        struct vop_fsync_args /* {
                struct vnode *a_vp;
                struct ucred *a_cred;
                int a_waitfor;
                struct thread *a_td;
        } */ *ap;
{
        struct vnode *vp = ap->a_vp;
        struct inode *ip = VTOI(vp);
        struct buf *bp;
        struct buf *nbp;
        int s, error, wait, passes, skipmeta;
        ufs_lbn_t lbn;

        wait = (ap->a_waitfor == MNT_WAIT);
        if (vn_isdisk(vp, NULL)) {
                lbn = INT_MAX;
                if (vp->v_rdev->si_mountpoint != NULL &&
                    (vp->v_rdev->si_mountpoint->mnt_flag & MNT_SOFTDEP))
                        softdep_fsync_mountdev(vp);
        } else {
                lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
        }

        /*
         * Flush all dirty buffers associated with a vnode.
         */
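        /*
         * Editor's note: the pass count appears to allow for the worst
         * case in which flushing a data block redirties its chain of up
         * to NIADDR indirect blocks, each of which may need a pass of
         * its own before the dirty list finally drains.
         */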
        passes = NIADDR + 1;
        skipmeta = 0;
        if (wait)
                skipmeta = 1;
        s = splbio();
        VI_LOCK(vp);
loop:
        TAILQ_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs)
                bp->b_flags &= ~B_SCANNED;
        for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
                nbp = TAILQ_NEXT(bp, b_vnbufs);
                /* 
                 * Reasons to skip this buffer: it has already been considered
                 * on this pass, this pass is the first time through on a
                 * synchronous flush request and the buffer being considered
                 * is metadata, the buffer has dependencies that will cause
                 * it to be redirtied and it has not already been deferred,
                 * or it is already being written.
                 */
                if ((bp->b_flags & B_SCANNED) != 0)
                        continue;
                bp->b_flags |= B_SCANNED;
                if ((skipmeta == 1 && bp->b_lblkno < 0))
                        continue;
                if (!wait && LIST_FIRST(&bp->b_dep) != NULL &&
                    (bp->b_flags & B_DEFERRED) == 0 &&
                    buf_countdeps(bp, 0)) {
                        bp->b_flags |= B_DEFERRED;
                        continue;
                }
                VI_UNLOCK(vp);
                if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
                        VI_LOCK(vp);
                        continue;
                }
                if ((bp->b_flags & B_DELWRI) == 0)
                        panic("ffs_fsync: not dirty");
                if (vp != bp->b_vp)
                        panic("ffs_fsync: vp != vp->b_vp");
                /*
                 * If this is a synchronous flush request, or it is not a
                 * file or device, start the write on this buffer immediately.
                 */
                if (wait || (vp->v_type != VREG && vp->v_type != VBLK)) {

                        /*
                         * On our final pass through, do all I/O synchronously
                         * so that we can find out if our flush is failing
                         * because of write errors.
                         */
                        if (passes > 0 || !wait) {
                                if ((bp->b_flags & B_CLUSTEROK) && !wait) {
                                        BUF_UNLOCK(bp);
                                        (void) vfs_bio_awrite(bp);
                                } else {
                                        bremfree(bp);
                                        splx(s);
                                        (void) bawrite(bp);
                                        s = splbio();
                                }
                        } else {
                                bremfree(bp);
                                splx(s);
                                if ((error = bwrite(bp)) != 0)
                                        return (error);
                                s = splbio();
                        }
                } else if ((vp->v_type == VREG) && (bp->b_lblkno >= lbn)) {
                        /* 
                         * If the buffer is for data that has been truncated
                         * off the file, then throw it away.
                         */
                        bremfree(bp);
                        bp->b_flags |= B_INVAL | B_NOCACHE;
                        splx(s);
                        brelse(bp);
                        s = splbio();
                } else {
                        BUF_UNLOCK(bp);
                        vfs_bio_awrite(bp);
                }
                /*
                 * Since we may have slept during the I/O, we need 
                 * to start from a known point.
                 */
                VI_LOCK(vp);
                nbp = TAILQ_FIRST(&vp->v_dirtyblkhd);
        }
        /*
         * If we were asked to do this synchronously, then go back for
         * another pass, this time doing the metadata.
         */
        if (skipmeta) {
                skipmeta = 0;
                goto loop;
        }

        if (wait) {
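                /*
                 * Editor's note: wait for any writes still in progress
                 * to drain; the I/O completion path clears VI_BWAIT and
                 * issues a wakeup on &vp->v_numoutput once the count
                 * reaches zero.
                 */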
                while (vp->v_numoutput) {
                        vp->v_iflag |= VI_BWAIT;
                        msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
                            PRIBIO + 4, "ffsfsn", 0);
                }
                VI_UNLOCK(vp);

                /* 
                 * Ensure that any filesystem metadata associated
                 * with the vnode has been written.
                 */
                splx(s);
                if ((error = softdep_sync_metadata(ap)) != 0)
                        return (error);
                s = splbio();

                VI_LOCK(vp);
                if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
                        /*
                         * Block devices associated with filesystems may
                         * have new I/O requests posted for them even if
                         * the vnode is locked, so no amount of trying will
                         * get them clean. Thus we give block devices a
                         * good effort, then just give up. For all other file
                         * types, go around and try again until it is clean.
                         */
                        if (passes > 0) {
                                passes -= 1;
                                goto loop;
                        }
#ifdef DIAGNOSTIC
                        if (!vn_isdisk(vp, NULL))
                                vprint("ffs_fsync: dirty", vp);
#endif
                }
        }
        VI_UNLOCK(vp);
        splx(s);
        return (UFS_UPDATE(vp, wait));
}


/*
 * Vnode op for reading.
 */
/* ARGSUSED */
static int
ffs_read(ap)
        struct vop_read_args /* {
                struct vnode *a_vp;
                struct uio *a_uio;
                int a_ioflag;
                struct ucred *a_cred;
        } */ *ap;
{
        struct vnode *vp;
        struct inode *ip;
        struct uio *uio;
        struct fs *fs;
        struct buf *bp;
        ufs_lbn_t lbn, nextlbn;
        off_t bytesinfile;
        long size, xfersize, blkoffset;
        int error, orig_resid;
        mode_t mode;
        int seqcount;
        int ioflag;
        vm_object_t object;

        vp = ap->a_vp;
        uio = ap->a_uio;
        ioflag = ap->a_ioflag;
        if (ap->a_ioflag & IO_EXT)
#ifdef notyet
                return (ffs_extread(vp, uio, ioflag));
#else
                panic("ffs_read+IO_EXT");
#endif

        GIANT_REQUIRED;

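        /*
         * Editor's note: the upper 16 bits of a_ioflag carry the
         * caller's estimate of how sequential this descriptor's access
         * pattern is (see sequential_heuristic() in vfs_vnops.c); it is
         * used below to scale read-ahead.
         */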
        seqcount = ap->a_ioflag >> 16;
        ip = VTOI(vp);
        mode = ip->i_mode;

#ifdef DIAGNOSTIC
        if (uio->uio_rw != UIO_READ)
                panic("ffs_read: mode");

        if (vp->v_type == VLNK) {
                if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
                        panic("ffs_read: short symlink");
        } else if (vp->v_type != VREG && vp->v_type != VDIR)
                panic("ffs_read: type %d",  vp->v_type);
#endif
        fs = ip->i_fs;
        if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
                return (EFBIG);

        orig_resid = uio->uio_resid;
        if (orig_resid <= 0)
                return (0);

        object = vp->v_object;

        bytesinfile = ip->i_size - uio->uio_offset;
        if (bytesinfile <= 0) {
                if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
                        ip->i_flag |= IN_ACCESS;
                return 0;
        }

        if (object) {
                vm_object_reference(object);
        }

#ifdef ENABLE_VFS_IOOPT
        /*
         * If IO optimisation is turned on,
         * and we are NOT a VM based IO request, 
         * (i.e. not headed for the buffer cache)
         * but there IS a vm object associated with it.
         */
        if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
                int nread, toread;

                toread = uio->uio_resid;
                if (toread > bytesinfile)
                        toread = bytesinfile;
                if (toread >= PAGE_SIZE) {
                        /*
                         * Then if it's at least a page in size, try to
                         * get the data from the object using vm tricks.
                         */
                        error = uioread(toread, uio, object, &nread);
                        if ((uio->uio_resid == 0) || (error != 0)) {
                                /*
                                 * If we finished or there was an error
                                 * then finish up (the reference previously
                                 * obtained on object must be released).
                                 */
                                if ((error == 0 ||
                                    uio->uio_resid != orig_resid) &&
                                    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
                                        ip->i_flag |= IN_ACCESS;

                                if (object) {
                                        vm_object_vndeallocate(object);
                                }
                                return error;
                        }
                }
        }
#endif

        /*
         * Ok so we couldn't do it all in one vm trick...
         * so cycle around trying smaller bites..
         */
        for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
                if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
                        break;
#ifdef ENABLE_VFS_IOOPT
                if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
                        /*
                         * Obviously we didn't finish above, but we
                         * didn't get an error either. Try the same trick
                         * again, but this time we are looping.
                         */
                        int nread, toread;
                        toread = uio->uio_resid;
                        if (toread > bytesinfile)
                                toread = bytesinfile;

                        /*
                         * Once again, if there isn't enough for a
                         * whole page, don't try optimising.
                         */
                        if (toread >= PAGE_SIZE) {
                                error = uioread(toread, uio, object, &nread);
                                if ((uio->uio_resid == 0) || (error != 0)) {
                                        /*
                                         * If we finished or there was an 
                                         * error then finish up (the reference
                                         * previously obtained on object must 
                                         * be released).
                                         */
                                        if ((error == 0 ||
                                            uio->uio_resid != orig_resid) &&
                                            (vp->v_mount->mnt_flag &
                                            MNT_NOATIME) == 0)
                                                ip->i_flag |= IN_ACCESS;
                                        if (object) {
                                                vm_object_vndeallocate(object);
                                        }
                                        return error;
                                }
                                /*
                                 * To get here we didn't finish or err.
                                 * If we did get some data,
                                 * loop to try another bite.
                                 */
                                if (nread > 0) {
                                        continue;
                                }
                        }
                }
#endif

                lbn = lblkno(fs, uio->uio_offset);
                nextlbn = lbn + 1;

                /*
                 * size of buffer.  The buffer representing the
                 * end of the file is rounded up to the size of
                 * the block type ( fragment or full block, 
                 * depending ).
                 */
                size = blksize(fs, ip, lbn);
                blkoffset = blkoff(fs, uio->uio_offset);

                /*
                 * The amount we want to transfer in this iteration is
                 * one FS block less the amount of the data before
                 * our startpoint (duh!)
                 */
                xfersize = fs->fs_bsize - blkoffset;

                /*
                 * But if we actually want less than the block,
                 * or the file doesn't have a whole block more of data,
                 * then use the lesser number.
                 */
                if (uio->uio_resid < xfersize)
                        xfersize = uio->uio_resid;
                if (bytesinfile < xfersize)
                        xfersize = bytesinfile;

                if (lblktosize(fs, nextlbn) >= ip->i_size) {
                        /*
                         * Don't do readahead if this is the end of the file.
                         */
                        error = bread(vp, lbn, size, NOCRED, &bp);
                } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
                        /* 
                         * Otherwise if we are allowed to cluster,
                         * grab as much as we can.
                         *
                         * XXX  This may not be a win if we are not
                         * doing sequential access.
                         */
                        error = cluster_read(vp, ip->i_size, lbn,
                                size, NOCRED, uio->uio_resid, seqcount, &bp);
                } else if (seqcount > 1) {
                        /*
                         * If we are NOT allowed to cluster, then
                         * if we appear to be acting sequentially,
                         * fire off a request for a readahead
                         * as well as a read. Note that the 4th and 5th
                         * arguments point to arrays of the size specified in
                         * the 6th argument.
                         */
                        int nextsize = blksize(fs, ip, nextlbn);
                        error = breadn(vp, lbn,
                            size, &nextlbn, &nextsize, 1, NOCRED, &bp);
                } else {
                        /*
                         * Failing all of the above, just read what the 
                         * user asked for. Interestingly, the same as
                         * the first option above.
                         */
                        error = bread(vp, lbn, size, NOCRED, &bp);
                }
                if (error) {
                        brelse(bp);
                        bp = NULL;
                        break;
                }

                /*
                 * If IO_DIRECT then set B_DIRECT for the buffer.  This
                 * will cause us to attempt to release the buffer later on
                 * and will cause the buffer cache to attempt to free the
                 * underlying pages.
                 */
                if (ioflag & IO_DIRECT)
                        bp->b_flags |= B_DIRECT;

                /*
                 * We should only get non-zero b_resid when an I/O error
                 * has occurred, which should cause us to break above.
                 * However, if the short read did not cause an error,
                 * then we want to ensure that we do not uiomove bad
                 * or uninitialized data.
                 */
                size -= bp->b_resid;
                if (size < xfersize) {
                        if (size == 0)
                                break;
                        xfersize = size;
                }

#ifdef ENABLE_VFS_IOOPT
                if (vfs_ioopt && object &&
                    (bp->b_flags & B_VMIO) &&
                    ((blkoffset & PAGE_MASK) == 0) &&
                    ((xfersize & PAGE_MASK) == 0)) {
                        /*
                         * If VFS IO optimisation is turned on,
                         * and it's an exact page multiple
                         * and a normal VM based op,
                         * then use uiomoveco().
                         */
                        error =
                                uiomoveco((char *)bp->b_data + blkoffset,
                                        (int)xfersize, uio, object, 0);
                } else 
#endif
                {
                        /*
                         * otherwise use the general form
                         */
                        error =
                                uiomove((char *)bp->b_data + blkoffset,
                                        (int)xfersize, uio);
                }

                if (error)
                        break;

                if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
                   (LIST_FIRST(&bp->b_dep) == NULL)) {
                        /*
                         * If there are no dependencies, and it's VMIO,
                         * then we don't need the buf, mark it available
                         * for freeing. The VM has the data.
                         */
                        bp->b_flags |= B_RELBUF;
                        brelse(bp);
                } else {
                        /*
                         * Otherwise let whoever
                         * made the request take care of
                         * freeing it. We just queue
                         * it onto another list.
                         */
                        bqrelse(bp);
                }
        }

        /* 
         * This can only happen in the case of an error,
         * because the loop above resets bp to NULL on each iteration
         * and on normal completion has not set a new value into it;
         * so it must have come from a 'break' statement.
         */
        if (bp != NULL) {
                if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
                   (LIST_FIRST(&bp->b_dep) == NULL)) {
                        bp->b_flags |= B_RELBUF;
                        brelse(bp);
                } else {
                        bqrelse(bp);
                }
        }

        if (object) {
                vm_object_vndeallocate(object);
        }
        if ((error == 0 || uio->uio_resid != orig_resid) &&
            (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
                ip->i_flag |= IN_ACCESS;
        return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
        struct vop_write_args /* {
                struct vnode *a_vp;
                struct uio *a_uio;
                int a_ioflag;
                struct ucred *a_cred;
        } */ *ap;
{
        struct vnode *vp;
        struct uio *uio;
        struct inode *ip;
        struct fs *fs;
        struct buf *bp;
        struct thread *td;
        ufs_lbn_t lbn;
        off_t osize;
        int seqcount;
        int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
        vm_object_t object;

        vp = ap->a_vp;
        uio = ap->a_uio;
        ioflag = ap->a_ioflag;
        if (ap->a_ioflag & IO_EXT)
#ifdef notyet
                return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
                panic("ffs_write+IO_EXT");
#endif

        GIANT_REQUIRED;

        extended = 0;
        seqcount = ap->a_ioflag >> 16;
        ip = VTOI(vp);

        object = vp->v_object;
        if (object) {
                vm_object_reference(object);
        }

#ifdef DIAGNOSTIC
        if (uio->uio_rw != UIO_WRITE)
                panic("ffswrite: mode");
#endif

        switch (vp->v_type) {
        case VREG:
                if (ioflag & IO_APPEND)
                        uio->uio_offset = ip->i_size;
                if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
                        if (object) {
                                vm_object_vndeallocate(object);
                        }
                        return (EPERM);
                }
                /* FALLTHROUGH */
        case VLNK:
                break;
        case VDIR:
                panic("ffswrite: dir write");
                break;
        default:
                panic("ffswrite: type %p %d (%d,%d)", vp, (int)vp->v_type,
                        (int)uio->uio_offset,
                        (int)uio->uio_resid
                );
        }

        fs = ip->i_fs;
        if (uio->uio_offset < 0 ||
            (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
                if (object) {
                        vm_object_vndeallocate(object);
                }
                return (EFBIG);
        }
        /*
         * Maybe this should be above the vnode op call, but so long as
         * file servers have no limits, I don't think it matters.
         */
        td = uio->uio_td;
        if (vp->v_type == VREG && td &&
            uio->uio_offset + uio->uio_resid >
            td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                PROC_LOCK(td->td_proc);
                psignal(td->td_proc, SIGXFSZ);
                PROC_UNLOCK(td->td_proc);
                if (object) {
                        vm_object_vndeallocate(object);
                }
                return (EFBIG);
        }

        resid = uio->uio_resid;
        osize = ip->i_size;
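        /*
         * Editor's note: the sequential-access hint is clamped to
         * BA_SEQMAX and folded into the flags word handed to
         * UFS_BALLOC() below; IO_SYNC is or'd in as well so that block
         * allocation for a synchronous write is itself performed
         * synchronously.
         */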
        if (seqcount > BA_SEQMAX)
                flags = BA_SEQMAX << BA_SEQSHIFT;
        else
                flags = seqcount << BA_SEQSHIFT;
        if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
                flags |= IO_SYNC;

#ifdef ENABLE_VFS_IOOPT
        if (object && (object->flags & OBJ_OPT)) {
                vm_freeze_copyopts(object,
                        OFF_TO_IDX(uio->uio_offset),
                        OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
        }
#endif
        for (error = 0; uio->uio_resid > 0;) {
                lbn = lblkno(fs, uio->uio_offset);
                blkoffset = blkoff(fs, uio->uio_offset);
                xfersize = fs->fs_bsize - blkoffset;
                if (uio->uio_resid < xfersize)
                        xfersize = uio->uio_resid;

                if (uio->uio_offset + xfersize > ip->i_size)
                        vnode_pager_setsize(vp, uio->uio_offset + xfersize);

                /*      
                 * We must perform a read-before-write if the transfer size
                 * does not cover the entire buffer.
                 */
                if (fs->fs_bsize > xfersize)
                        flags |= BA_CLRBUF;
                else
                        flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
                error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
                    ap->a_cred, flags, &bp);
                if (error != 0)
                        break;
                /*
                 * If the buffer is not valid we have to clear out any
                 * garbage data from the pages instantiated for the buffer.
                 * If we do not, a failed uiomove() during a write can leave
                 * the prior contents of the pages exposed to a userland
                 * mmap().  XXX deal with uiomove() errors a better way.
                 */
                if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
                        vfs_bio_clrbuf(bp);
                if (ioflag & IO_DIRECT)
                        bp->b_flags |= B_DIRECT;
                if (ioflag & IO_NOWDRAIN)
                        bp->b_flags |= B_NOWDRAIN;

                if (uio->uio_offset + xfersize > ip->i_size) {
                        ip->i_size = uio->uio_offset + xfersize;
                        DIP(ip, i_size) = ip->i_size;
                        extended = 1;
                }

                size = blksize(fs, ip, lbn) - bp->b_resid;
                if (size < xfersize)
                        xfersize = size;

                error =
                    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
                if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
                   (LIST_FIRST(&bp->b_dep) == NULL)) {
                        bp->b_flags |= B_RELBUF;
                }

                /*
                 * If IO_SYNC each buffer is written synchronously.  Otherwise
                 * if we have a severe page deficiency write the buffer 
                 * asynchronously.  Otherwise try to cluster, and if that
                 * doesn't do it then either do an async write (if O_DIRECT),
                 * or a delayed write (if not).
                 */
                if (ioflag & IO_SYNC) {
                        (void)bwrite(bp);
                } else if (vm_page_count_severe() ||
                            buf_dirty_count_severe() ||
                            (ioflag & IO_ASYNC)) {
                        bp->b_flags |= B_CLUSTEROK;
                        bawrite(bp);
                } else if (xfersize + blkoffset == fs->fs_bsize) {
                        if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
                                bp->b_flags |= B_CLUSTEROK;
                                cluster_write(bp, ip->i_size, seqcount);
                        } else {
                                bawrite(bp);
                        }
                } else if (ioflag & IO_DIRECT) {
                        bp->b_flags |= B_CLUSTEROK;
                        bawrite(bp);
                } else {
                        bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
                }
                if (error || xfersize == 0)
                        break;
                ip->i_flag |= IN_CHANGE | IN_UPDATE;
        }
        /*
         * If we successfully wrote any data and we are not the superuser,
         * we clear the setuid and setgid bits as a precaution against
         * tampering.
         */
        if (resid > uio->uio_resid && ap->a_cred && 
            suser_cred(ap->a_cred, PRISON_ROOT)) {
                ip->i_mode &= ~(ISUID | ISGID);
                DIP(ip, i_mode) = ip->i_mode;
        }
        if (resid > uio->uio_resid)
                VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
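        /*
         * Editor's note: IO_UNIT writes are all-or-nothing; on error,
         * truncate the file back to its original size and reset the uio
         * so that the caller does not observe a partial transfer.
         */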
        if (error) {
                if (ioflag & IO_UNIT) {
                        (void)UFS_TRUNCATE(vp, osize,
                            IO_NORMAL | (ioflag & IO_SYNC),
                            ap->a_cred, uio->uio_td);
                        uio->uio_offset -= resid - uio->uio_resid;
                        uio->uio_resid = resid;
                }
        } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
                error = UFS_UPDATE(vp, 1);

        if (object) {
                vm_object_vndeallocate(object);
        }

        return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
        struct vop_getpages_args *ap;
{
        off_t foff, physoffset;
        int i, size, bsize;
        struct vnode *dp, *vp;
        vm_object_t obj;
        vm_pindex_t pindex, firstindex;
        vm_page_t mreq;
        int bbackwards, bforwards;
        int pbackwards, pforwards;
        int firstpage;
        ufs2_daddr_t reqblkno, reqlblkno;
        int poff;
        int pcount;
        int rtval;
        int pagesperblock;

        GIANT_REQUIRED;

        pcount = round_page(ap->a_count) / PAGE_SIZE;
        mreq = ap->a_m[ap->a_reqpage];
        firstindex = ap->a_m[0]->pindex;

        /*
         * If ANY DEV_BSIZE blocks are valid on a large filesystem block,
         * then the entire page is valid.  Since the page may be mapped,
         * user programs might reference data beyond the actual end of file
         * occurring within the page.  We have to zero that data.
         */
        if (mreq->valid) {
                if (mreq->valid != VM_PAGE_BITS_ALL)
                        vm_page_zero_invalid(mreq, TRUE);
                vm_page_lock_queues();
                for (i = 0; i < pcount; i++) {
                        if (i != ap->a_reqpage) {
                                vm_page_free(ap->a_m[i]);
                        }
                }
                vm_page_unlock_queues();
                return VM_PAGER_OK;
        }

        vp = ap->a_vp;
        obj = vp->v_object;
        bsize = vp->v_mount->mnt_stat.f_iosize;
        pindex = mreq->pindex;
        foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;

        if (bsize < PAGE_SIZE)
                return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
                                                    ap->a_count,
                                                    ap->a_reqpage);

        /*
         * foff is the file offset of the required page
         * reqlblkno is the logical block that contains the page
         * poff is the index of the page into the logical block
         */
        reqlblkno = foff / bsize;
        poff = (foff % bsize) / PAGE_SIZE;

        dp = VTOI(vp)->i_devvp;
        if (ufs_bmaparray(vp, reqlblkno, &reqblkno, 0, &bforwards, &bbackwards)
            || (reqblkno == -1)) {
                vm_page_lock_queues();
                for(i = 0; i < pcount; i++) {
                        if (i != ap->a_reqpage)
                                vm_page_free(ap->a_m[i]);
                }
                vm_page_unlock_queues();
                if (reqblkno == -1) {
                        if ((mreq->flags & PG_ZERO) == 0)
                                pmap_zero_page(mreq);
                        vm_page_undirty(mreq);
                        mreq->valid = VM_PAGE_BITS_ALL;
                        return VM_PAGER_OK;
                } else {
                        return VM_PAGER_ERROR;
                }
        }

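        /*
         * Editor's note: reqblkno is a disk address expressed in
         * DEV_BSIZE units, so the physical byte offset of the requested
         * page is that block's byte address plus the page's offset
         * within the logical block.
         */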
        physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
        pagesperblock = bsize / PAGE_SIZE;
        /*
         * find the first page that is contiguous...
         * note that pbackwards is the number of pages that are contiguous
         * backwards.
         */
        firstpage = 0;
        if (ap->a_count) {
                pbackwards = poff + bbackwards * pagesperblock;
                if (ap->a_reqpage > pbackwards) {
                        firstpage = ap->a_reqpage - pbackwards;
                        vm_page_lock_queues();
                        for(i=0;i<firstpage;i++)
                                vm_page_free(ap->a_m[i]);
                        vm_page_unlock_queues();
                }

        /*
         * pforwards is the number of pages that are contiguous
         * after the current page.
         */
                pforwards = (pagesperblock - (poff + 1)) +
                        bforwards * pagesperblock;
                if (pforwards < (pcount - (ap->a_reqpage + 1))) {
                        vm_page_lock_queues();
                        for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
                                vm_page_free(ap->a_m[i]);
                        vm_page_unlock_queues();
                        pcount = ap->a_reqpage + pforwards + 1;
                }

        /*
         * number of pages for I/O corrected for the non-contig pages at
         * the beginning of the array.
         */
                pcount -= firstpage;
        }

        /*
         * calculate the size of the transfer
         */

        size = pcount * PAGE_SIZE;

        if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
                obj->un_pager.vnp.vnp_size)
                size = obj->un_pager.vnp.vnp_size -
                        IDX_TO_OFF(ap->a_m[firstpage]->pindex);

        physoffset -= foff;
        rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
                (ap->a_reqpage - firstpage), physoffset);

        return (rtval);
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
        struct inode *ip;
        struct ufs2_dinode *dp;
        struct fs *fs;
        struct buf *bp;
        ufs_lbn_t lbn, nextlbn;
        off_t bytesinfile;
        long size, xfersize, blkoffset;
        int error, orig_resid;
        mode_t mode;

        GIANT_REQUIRED;

        ip = VTOI(vp);
        fs = ip->i_fs;
        dp = ip->i_din2;
        mode = ip->i_mode;

#ifdef DIAGNOSTIC
        if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
                panic("ffs_extread: mode");

#endif
        orig_resid = uio->uio_resid;
        if (orig_resid <= 0)
                return (0);

        bytesinfile = dp->di_extsize - uio->uio_offset;
        if (bytesinfile <= 0) {
                if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
                        ip->i_flag |= IN_ACCESS;
                return 0;
        }

        for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
                if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
                        break;

                lbn = lblkno(fs, uio->uio_offset);
                nextlbn = lbn + 1;

                /*
                 * size of buffer.  The buffer representing the
                 * end of the file is rounded up to the size of
                 * the block type ( fragment or full block, 
                 * depending ).
                 */
                size = sblksize(fs, dp->di_extsize, lbn);
                blkoffset = blkoff(fs, uio->uio_offset);

                /*
                 * The amount we want to transfer in this iteration is
                 * one FS block less the amount of the data before
                 * our startpoint (duh!)
                 */
                xfersize = fs->fs_bsize - blkoffset;

                /*
                 * But if we actually want less than the block,
                 * or the file doesn't have a whole block more of data,
                 * then use the lesser number.
                 */
                if (uio->uio_resid < xfersize)
                        xfersize = uio->uio_resid;
                if (bytesinfile < xfersize)
                        xfersize = bytesinfile;

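                /*
                 * Editor's note: the external attribute area is
                 * addressed with negative logical block numbers (block
                 * lbn of the area is read as -1 - lbn), which the UFS
                 * bmap code maps onto the inode's external block
                 * pointers.
                 */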
                if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
                        /*
                         * Don't do readahead if this is the end of the info.
                         */
                        error = bread(vp, -1 - lbn, size, NOCRED, &bp);
                } else {
                        /*
                         * If we have a second block, then
                         * fire off a request for a readahead
                         * as well as a read. Note that the 4th and 5th
                         * arguments point to arrays of the size specified in
                         * the 6th argument.
                         */
                        int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

                        nextlbn = -1 - nextlbn;
                        error = breadn(vp, -1 - lbn,
                            size, &nextlbn, &nextsize, 1, NOCRED, &bp);
                }
                if (error) {
                        brelse(bp);
                        bp = NULL;
                        break;
                }

                /*
                 * If IO_DIRECT then set B_DIRECT for the buffer.  This
                 * will cause us to attempt to release the buffer later on
                 * and will cause the buffer cache to attempt to free the
                 * underlying pages.
                 */
                if (ioflag & IO_DIRECT)
                        bp->b_flags |= B_DIRECT;

                /*
                 * We should only get non-zero b_resid when an I/O error
                 * has occurred, which should cause us to break above.
                 * However, if the short read did not cause an error,
                 * then we want to ensure that we do not uiomove bad
                 * or uninitialized data.
                 */
                size -= bp->b_resid;
                if (size < xfersize) {
                        if (size == 0)
                                break;
                        xfersize = size;
                }

                error = uiomove((char *)bp->b_data + blkoffset,
                                        (int)xfersize, uio);
                if (error)
                        break;

                if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
                   (LIST_FIRST(&bp->b_dep) == NULL)) {
                        /*
                         * If there are no dependencies, and it's VMIO,
                         * then we don't need the buf, mark it available
                         * for freeing. The VM has the data.
                         */
                        bp->b_flags |= B_RELBUF;
                        brelse(bp);
                } else {
                        /*
                         * Otherwise let whoever
                         * made the request take care of
                         * freeing it. We just queue
                         * it onto another list.
                         */
                        bqrelse(bp);
                }
        }

        /* 
         * This can only happen in the case of an error,
         * because the loop above resets bp to NULL on each iteration
         * and on normal completion has not set a new value into it;
         * so it must have come from a 'break' statement.
         */
        if (bp != NULL) {
                if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
                   (LIST_FIRST(&bp->b_dep) == NULL)) {
                        bp->b_flags |= B_RELBUF;
                        brelse(bp);
                } else {
                        bqrelse(bp);
                }
        }

        if ((error == 0 || uio->uio_resid != orig_resid) &&
            (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
                ip->i_flag |= IN_ACCESS;
        return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
        struct inode *ip;
        struct ufs2_dinode *dp;
        struct fs *fs;
        struct buf *bp;
        ufs_lbn_t lbn;
        off_t osize;
        int blkoffset, error, flags, resid, size, xfersize;

        GIANT_REQUIRED;

        ip = VTOI(vp);
        fs = ip->i_fs;
        dp = ip->i_din2;

#ifdef DIAGNOSTIC
        if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
                panic("ext_write: mode");
#endif

        if (ioflag & IO_APPEND)
                uio->uio_offset = dp->di_extsize;

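        /*
         * Editor's note: the external attribute area can occupy at most
         * the NXADDR direct block pointers in the inode, hence the size
         * check below.
         */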
 1220         if (uio->uio_offset < 0 ||
 1221             (u_int64_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
 1222                 return (EFBIG);
 1223 
 1224         resid = uio->uio_resid;
 1225         osize = dp->di_extsize;
 1226         flags = IO_EXT;
 1227         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 1228                 flags |= IO_SYNC;
 1229 
 1230         for (error = 0; uio->uio_resid > 0;) {
 1231                 lbn = lblkno(fs, uio->uio_offset);
 1232                 blkoffset = blkoff(fs, uio->uio_offset);
 1233                 xfersize = fs->fs_bsize - blkoffset;
 1234                 if (uio->uio_resid < xfersize)
 1235                         xfersize = uio->uio_resid;
 1236 
 1237                 /*      
 1238                  * We must perform a read-before-write if the transfer size
 1239                  * does not cover the entire buffer.
 1240                  */
 1241                 if (fs->fs_bsize > xfersize)
 1242                         flags |= BA_CLRBUF;
 1243                 else
 1244                         flags &= ~BA_CLRBUF;
 1245                 error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
 1246                     ucred, flags, &bp);
 1247                 if (error != 0)
 1248                         break;
 1249                 /*
 1250                  * If the buffer is not valid we have to clear out any
 1251                  * garbage data from the pages instantiated for the buffer.
 1252                  * If we do not, a failed uiomove() during a write can leave
 1253                  * the prior contents of the pages exposed to a userland
 1254                  * mmap().  XXX deal with uiomove() errors a better way.
 1255                  */
 1256                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
 1257                         vfs_bio_clrbuf(bp);
 1258                 if (ioflag & IO_DIRECT)
 1259                         bp->b_flags |= B_DIRECT;
 1260                 if (ioflag & IO_NOWDRAIN)
 1261                         bp->b_flags |= B_NOWDRAIN;
 1262 
 1263                 if (uio->uio_offset + xfersize > dp->di_extsize)
 1264                         dp->di_extsize = uio->uio_offset + xfersize;
 1265 
 1266                 size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
 1267                 if (size < xfersize)
 1268                         xfersize = size;
 1269 
 1270                 error =
 1271                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 1272                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 1273                    (LIST_FIRST(&bp->b_dep) == NULL)) {
 1274                         bp->b_flags |= B_RELBUF;
 1275                 }
 1276 
 1277                 /*
 1278                  * If IO_SYNC, each buffer is written synchronously.
 1279                  * Otherwise, if there is a severe page or buffer-space
 1280                  * deficiency, the block has been filled, or direct or
 1281                  * async I/O was requested, write the buffer asynchronously;
 1282                  * failing all of that, use a delayed write.
 1283                  */
 1284                 if (ioflag & IO_SYNC) {
 1285                         (void)bwrite(bp);
 1286                 } else if (vm_page_count_severe() ||
 1287                             buf_dirty_count_severe() ||
 1288                             xfersize + blkoffset == fs->fs_bsize ||
 1289                             (ioflag & (IO_ASYNC | IO_DIRECT)))
 1290                         bawrite(bp);
 1291                 else
 1292                         bdwrite(bp);
 1293                 if (error || xfersize == 0)
 1294                         break;
 1295                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 1296         }
 1297         /*
 1298          * If we successfully wrote any data and we are not the superuser,
 1299          * we clear the setuid and setgid bits as a precaution against
 1300          * tampering.
 1301          */
 1302         if (resid > uio->uio_resid && ucred && 
 1303             suser_cred(ucred, PRISON_ROOT)) {
 1304                 ip->i_mode &= ~(ISUID | ISGID);
 1305                 dp->di_mode = ip->i_mode;
 1306         }
 1307         if (error) {
 1308                 if (ioflag & IO_UNIT) {
 1309                         (void)UFS_TRUNCATE(vp, osize,
 1310                             IO_EXT | (ioflag&IO_SYNC), ucred, uio->uio_td);
 1311                         uio->uio_offset -= resid - uio->uio_resid;
 1312                         uio->uio_resid = resid;
 1313                 }
 1314         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 1315                 error = UFS_UPDATE(vp, 1);
 1316         return (error);
 1317 }
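
      /*
       * Illustrative sketch, not part of the kernel source: the
       * per-iteration arithmetic in the loop above reduces to integer
       * division and remainder on the block size.  The figures below
       * assume a hypothetical 16 KB fs_bsize; lblkno() and blkoff() are
       * the macros used above.
       */
      #if 0   /* example only, never compiled */
      #include <stdio.h>

      int
      main(void)
      {
              const long bsize = 16384;               /* assumed fs_bsize */
              long off = 20000, resid = 40000;        /* hypothetical write */

              while (resid > 0) {
                      long lbn = off / bsize;         /* lblkno(fs, off) */
                      long blkoffset = off % bsize;   /* blkoff(fs, off) */
                      long xfersize = bsize - blkoffset;

                      if (resid < xfersize)
                              xfersize = resid;
                      /* first pass: lbn 1, blkoffset 3616, xfersize 12768 */
                      printf("lbn %ld blkoffset %ld xfersize %ld\n",
                          lbn, blkoffset, xfersize);
                      off += xfersize;
                      resid -= xfersize;
              }
              return (0);
      }
      #endif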
 1318 
 1319 
 1320 /*
 1321  * Extended attribute area lookup helper.
 1322  *
 1323  * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 1324  * the length of the EA, and possibly pointers to the entry and to the data.
 1325  */
 1326 static int
 1327 ffs_findextattr(u_char *ptr, uint length, int nspace, const char *name, u_char **eap, u_char **eac)
 1328 {
 1329         u_char *p, *pe, *pn, *p0;
 1330         int eapad1, eapad2, ealength, ealen, nlen;
 1331         uint32_t ul;
 1332 
 1333         pe = ptr + length;
 1334         nlen = strlen(name);
 1335 
 1336         for (p = ptr; p < pe; p = pn) {
 1337                 p0 = p;
 1338                 bcopy(p, &ul, sizeof(ul));
 1339                 pn = p + ul;
 1340                 /* make sure this entry is complete */
 1341                 if (pn > pe)
 1342                         break;
 1343                 p += sizeof(uint32_t);
 1344                 if (*p != nspace)
 1345                         continue;
 1346                 p++;
 1347                 eapad2 = *p++;
 1348                 if (*p != nlen)
 1349                         continue;
 1350                 p++;
 1351                 if (bcmp(p, name, nlen))
 1352                         continue;
 1353                 ealength = sizeof(uint32_t) + 3 + nlen;
 1354                 eapad1 = 8 - (ealength % 8);
 1355                 if (eapad1 == 8)
 1356                         eapad1 = 0;
 1357                 ealength += eapad1;
 1358                 ealen = ul - ealength - eapad2;
 1359                 p += nlen + eapad1;
 1360                 if (eap != NULL)
 1361                         *eap = p0;
 1362                 if (eac != NULL)
 1363                         *eac = p;
 1364                 return (ealen);
 1365         }
 1366         return (-1);
 1367 }
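
      /*
       * Layout of one extended-attribute record, as reconstructed from the
       * parsing code above (field names are illustrative, not taken from
       * any header file):
       *
       *      offset    size    contents
       *      0         4       total record length (ul), a multiple of 8
       *      4         1       attribute namespace
       *      5         1       content padding length (eapad2)
       *      6         1       name length (nlen)
       *      7         nlen    name, not nul-terminated
       *      7+nlen    eapad1  padding to an 8-byte header boundary
       *      ...       ealen   attribute content
       *      ...       eapad2  trailing padding
       *
       * Hence ealen = ul - (4 + 3 + nlen + eapad1) - eapad2, which is the
       * value the function returns.
       */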
 1368 
 1369 static int
 1370 ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
 1371 {
 1372         struct inode *ip;
 1373         struct fs *fs;
 1374         struct ufs2_dinode *dp;
 1375         struct uio luio;
 1376         struct iovec liovec;
 1377         int easize, error;
 1378         u_char *eae;
 1379 
 1380         ip = VTOI(vp);
 1381         fs = ip->i_fs;
 1382         dp = ip->i_din2;
 1383         easize = dp->di_extsize;
 1384 
 1385         eae = malloc(easize + extra, M_TEMP, M_WAITOK);
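              /*
               * The extra argument reserves slack space beyond the
               * attribute area in this allocation; ffs_open_ea() passes 0,
               * so the buffer is sized exactly to di_extsize.
               */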
 1386 
 1387         liovec.iov_base = eae;
 1388         liovec.iov_len = easize;
 1389         luio.uio_iov = &liovec;
 1390         luio.uio_iovcnt = 1;
 1391         luio.uio_offset = 0;
 1392         luio.uio_resid = easize;
 1393         luio.uio_segflg = UIO_SYSSPACE;
 1394         luio.uio_rw = UIO_READ;
 1395         luio.uio_td = td;
 1396 
 1397         error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
 1398         if (error) {
 1399                 free(eae, M_TEMP);
 1400                 return (error);
 1401         }
 1402         *p = eae;
 1403         return (0);
 1404 }
 1405 
 1406 static int
 1407 ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
 1408 {
 1409         struct inode *ip;
 1410         struct fs *fs;
 1411         struct ufs2_dinode *dp;
 1412         int error;
 1413 
 1414         ip = VTOI(vp);
 1415         fs = ip->i_fs;
 1416 
 1417         if (ip->i_ea_area != NULL)
 1418                 return (EBUSY);
 1419         dp = ip->i_din2;
 1420         error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
 1421         if (error)
 1422                 return (error);
 1423         ip->i_ea_len = dp->di_extsize;
 1424         ip->i_ea_error = 0;
 1425         return (0);
 1426 }
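
      /*
       * Together, ffs_open_ea() above and ffs_close_ea() below bracket a
       * simple transaction on the attribute area: open reads the whole
       * area into the in-memory copy at ip->i_ea_area, the intervening
       * operations edit that copy, and close either writes it back
       * (commit) or discards it.
       */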
 1427 
 1428 /*
 1429  * Vnode extattr transaction commit/abort
 1430  */
 1431 static int
 1432 ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
 1433 {
 1434         struct inode *ip;
 1435         struct fs *fs;
 1436         struct uio luio;
 1437         struct iovec liovec;
 1438         int error;
 1439         struct ufs2_dinode *dp;
 1440 
 1441         ip = VTOI(vp);
 1442         fs = ip->i_fs;
 1443         if (ip->i_ea_area == NULL)
 1444                 return (EINVAL);
 1445         dp = ip->i_din2;
 1446         error = ip->i_ea_error;
 1447         if (commit && error == 0) {
 1448                 if (cred == NOCRED)
 1449                         cred = vp->v_mount->mnt_cred;
 1450                 liovec.iov_base = ip->i_ea_area;
 1451                 liovec.iov_len = ip->i_ea_len;
 1452                 luio.uio_iov = &liovec;
 1453                 luio.uio_iovcnt = 1;
 1454                 luio.uio_offset = 0;
 1455                 luio.uio_resid = ip->i_ea_len;
 1456                 luio.uio_segflg = UIO_SYSSPACE;
 1457                 luio.uio_rw = UIO_WRITE;
 1458                 luio.uio_td = td;
 1459                 /* XXX: I'm not happy about truncating to zero size */
 1460                 if (ip->i_ea_len < dp->di_extsize)
 1461                         error = ffs_truncate(vp, 0, IO_EXT, cred, td);
 1462                 if (error == 0)
                              error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
 1463         }
 1464         free(ip->i_ea_area, M_TEMP);
 1465         ip->i_ea_area = NULL;
 1466         ip->i_ea_len = 0;
 1467         ip->i_ea_error = 0;
 1468         return (error);
 1469 }
 1470 
 1471 /*
 1472  * Vnode extattr strategy routine for special devices and fifos.
 1473  *
 1474  * If the I/O targets the external attribute area, hand it to the normal
 1475  * FFS path; otherwise fall through to the fifo or special-device strategy.
 1476  */
 1477 static int
 1478 ffsext_strategy(struct vop_strategy_args *ap)
 1479 /*
 1480 struct vop_strategy_args {
 1481         struct vnodeop_desc *a_desc;
 1482         struct vnode *a_vp;
 1483         struct buf *a_bp;
 1484 };
 1485 */
 1486 {
 1487         struct vnode *vp;
 1488         daddr_t lbn;
 1489 
 1490         vp = ap->a_vp;
 1491         lbn = ap->a_bp->b_lblkno;
 1492         if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
 1493             lbn < 0 && lbn >= -NXADDR)
 1494                 return (ufs_vnoperate((struct vop_generic_args *)ap));
 1495         if (vp->v_type == VFIFO)
 1496                 return (ufs_vnoperatefifo((struct vop_generic_args *)ap));
 1497         return (ufs_vnoperatespec((struct vop_generic_args *)ap));
 1498 }
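
      /*
       * Note on the test above: UFS2 places the external attribute blocks
       * at the negative logical block numbers -1 .. -NXADDR on the vnode,
       * so those requests are routed to the ordinary FFS strategy path
       * rather than to the fifo or special-device one.
       */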
 1499 
 1500 /*
 1501  * Vnode extattr transaction start.
 1502  */
 1503 static int
 1504 ffs_openextattr(struct vop_openextattr_args *ap)
 1505 /*
 1506 struct vop_openextattr_args {
 1507         struct vnodeop_desc *a_desc;
 1508         struct vnode *a_vp;
 1509         IN struct ucred *a_cred;
 1510         IN struct thread *a_td;
 1511 };
 1512 */
 1513 {
 1514         struct inode *ip;
 1515         struct fs *fs;
 1516 
 1517         ip = VTOI(ap->a_vp);
 1518         fs = ip->i_fs;
 1519         if (fs->fs_magic == FS_UFS1_MAGIC)
 1520                 return (ufs_vnoperate((struct vop_generic_args *)ap));
 1521         return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
 1522 }
 1523 
 1524 
 1525 /*
 1526  * Vnode extattr transaction commit/abort
 1527  */
 1528 static int
 1529 ffs_closeextattr(struct vop_closeextattr_args *ap)
 1530 /*
 1531 struct vop_closeextattr_args {
 1532         struct vnodeop_desc *a_desc;
 1533         struct vnode *a_vp;
 1534         int a_commit;
 1535         IN struct ucred *a_cred;
 1536         IN struct thread *a_td;
 1537 };
 1538 */
 1539 {
 1540         struct inode *ip;
 1541         struct fs *fs;
 1542 
 1543         ip = VTOI(ap->a_vp);
 1544         fs = ip->i_fs;
 1545         if (fs->fs_magic == FS_UFS1_MAGIC)
 1546                 return (ufs_vnoperate((struct vop_generic_args *)ap));
 1547         return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
 1548 }
 1549 
 1550 
 1551 
 1552 /*
 1553  * Vnode operation to retrieve a named extended attribute.
 1554  */
 1555 static int
 1556 ffs_getextattr(struct vop_getextattr_args *ap)
 1557 /*
 1558 vop_getextattr {
 1559         IN struct vnode *a_vp;
 1560         IN int a_attrnamespace;
 1561         IN const char *a_name;
 1562         INOUT struct uio *a_uio;
 1563         OUT size_t *a_size;
 1564         IN struct ucred *a_cred;
 1565         IN struct thread *a_td;
 1566 };
 1567 */
 1568 {
 1569         struct inode *ip;
 1570         struct fs *fs;
 1571         u_char *eae, *p, *pe, *pn;
 1572         struct ufs2_dinode *dp;
 1573         unsigned easize;
 1574         uint32_t ul;
 1575         int error, ealen, stand_alone;
 1576 
 1577         ip = VTOI(ap->a_vp);
 1578         fs = ip->i_fs;
 1579 
 1580         if (fs->fs_magic == FS_UFS1_MAGIC)
 1581                 return (ufs_vnoperate((struct vop_generic_args *)ap));
 1582 
 1583         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 1584             ap->a_cred, ap->a_td, IREAD);
 1585         if (error)
 1586                 return (error);
 1587 
 1588         if (ip->i_ea_area == NULL) {
 1589                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
 1590                 if (error)
 1591                         return (error);
 1592                 stand_alone = 1;
 1593         } else {
 1594                 stand_alone = 0;
 1595         }
 1596         dp = ip->i_din2;
 1597         eae = ip->i_ea_area;
 1598         easize = ip->i_ea_len;
 1599         if (strlen(ap->a_name) > 0) {
 1600                 ealen = ffs_findextattr(eae, easize,
 1601                     ap->a_attrnamespace, ap->a_name, NULL, &p);
 1602                 if (ealen >= 0) {
 1603                         error = 0;
 1604                         if (ap->a_size != NULL)
 1605                                 *ap->a_size = ealen;
 1606                         else if (ap->a_uio != NULL)
 1607                                 error = uiomove(p, ealen, ap->a_uio);
 1608                 } else {
 1609                         error = ENOATTR;
 1610                 }
 1611         } else {
 1612                 error = 0;
 1613                 if (ap->a_size != NULL)
 1614                         *ap->a_size = 0;
 1615                 pe = eae + easize;
 1616                 for (p = eae; error == 0 && p < pe; p = pn) {
 1617                         bcopy(p, &ul, sizeof(ul));
 1618                         pn = p + ul;
 1619                         if (pn > pe)
 1620                                 break;
 1621                         p += sizeof(ul);
 1622                         if (*p++ != ap->a_attrnamespace)
 1623                                 continue;
 1624                         p++;    /* pad2 */
 1625                         ealen = *p;
 1626                         if (ap->a_size != NULL) {
 1627                                 *ap->a_size += ealen + 1;
 1628                         } else if (ap->a_uio != NULL) {
 1629                                 error = uiomove(p, ealen + 1, ap->a_uio);
 1630                         }
 1631                 }
 1632         }
 1633         if (stand_alone)
 1634                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 1635         return (error);
 1636 }
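
      /*
       * Illustrative sketch, not kernel code: when a_name is empty, the
       * loop above returns the attribute list as a sequence of counted
       * strings, one length byte followed by that many name bytes.  A
       * consumer of such a buffer (buf and len are hypothetical) could
       * decode it like this:
       */
      #if 0   /* example only, never compiled */
      #include <stdio.h>

      static void
      print_ea_list(const unsigned char *buf, size_t len)
      {
              size_t i = 0;

              while (i < len) {
                      unsigned int nlen = buf[i++];   /* name length */

                      printf("%.*s\n", (int)nlen, &buf[i]);
                      i += nlen;
              }
      }
      #endif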
 1637 
 1638 /*
 1639  * Vnode operation to set a named attribute.
 1640  */
 1641 static int
 1642 ffs_setextattr(struct vop_setextattr_args *ap)
 1643 /*
 1644 vop_setextattr {
 1645         IN struct vnode *a_vp;
 1646         IN int a_attrnamespace;
 1647         IN const char *a_name;
 1648         INOUT struct uio *a_uio;
 1649         IN struct ucred *a_cred;
 1650         IN struct thread *a_td;
 1651 };
 1652 */
 1653 {
 1654         struct inode *ip;
 1655         struct fs *fs;
 1656         uint32_t ealength, ul;
 1657         int ealen, olen, eacont, eapad1, eapad2, error, i, easize;
 1658         u_char *eae, *p;
 1659         struct ufs2_dinode *dp;
 1660         struct ucred *cred;
 1661         int stand_alone;
 1662 
 1663         ip = VTOI(ap->a_vp);
 1664         fs = ip->i_fs;
 1665 
 1666         if (fs->fs_magic == FS_UFS1_MAGIC)
 1667                 return (ufs_vnoperate((struct vop_generic_args *)ap));
 1668 
 1669         error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 1670             ap->a_cred, ap->a_td, IWRITE);
 1671         if (error) {
 1672                 if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
 1673                         ip->i_ea_error = error;
 1674                 return (error);
 1675         }
 1676 
 1677         if (ap->a_cred != NOCRED)
 1678                 cred = ap->a_cred;
 1679         else
 1680                 cred = ap->a_vp->v_mount->mnt_cred;
 1681 
 1682         dp = ip->i_din2;
 1683 
 1684         if (ip->i_ea_area == NULL) {
 1685                 error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
 1686                 if (error)
 1687                         return (error);
 1688                 stand_alone = 1;
 1689         } else {
 1690                 stand_alone = 0;
 1691         }
 1692 
 1693         /* Calculate the length of the EA entry */
 1694         if (ap->a_uio == NULL) {
 1695                 /* delete */
 1696                 ealength = eapad1 = ealen = eapad2 = eacont = 0;
 1697         } else {
 1698                 ealen = ap->a_uio->uio_resid;
 1699                 ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
 1700                 eapad1 = 8 - (ealength % 8);
 1701                 if (eapad1 == 8)
 1702                         eapad1 = 0;
 1703                 eacont = ealength + eapad1;
 1704                 eapad2 = 8 - (ealen % 8);
 1705                 if (eapad2 == 8)
 1706                         eapad2 = 0;
 1707                 ealength += eapad1 + ealen + eapad2;
 1708         }
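              /*
               * Worked example (hypothetical values): for name "foo"
               * (nlen 3) and a 9-byte value, the fixed header is
               * 4 + 3 + 3 = 10 bytes, eapad1 = 6 rounds it to 16,
               * eapad2 = 8 - (9 % 8) = 7 pads the value, and
               * ealength = 10 + 6 + 9 + 7 = 32, so every record stays
               * 8-byte aligned.
               */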
 1709 
 1710         eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
 1711         bcopy(ip->i_ea_area, eae, ip->i_ea_len);
 1712         easize = ip->i_ea_len;
 1713 
 1714         olen = ffs_findextattr(eae, easize,
 1715             ap->a_attrnamespace, ap->a_name, &p, NULL);
 1716         if (olen == -1 && ealength == 0) {
 1717                 /* delete but nonexistent */
 1718                 free(eae, M_TEMP);
 1719                 if (stand_alone)
 1720                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 1721                 return (ENOATTR);
 1722         }
 1723         if (olen == -1) {
 1724                 /* new, append at end */
 1725                 p = eae + easize;
 1726                 easize += ealength;
 1727         } else {
 1728                 bcopy(p, &ul, sizeof(ul));
 1729                 i = p - eae + ul;
 1730                 if (ul != ealength) {
 1731                         bcopy(p + ul, p + ealength, easize - i);
 1732                         easize += (ealength - ul);
 1733                 }
 1734         }
 1735         if (easize > NXADDR * fs->fs_bsize) {
 1736                 free(eae, M_TEMP);
 1737                 if (stand_alone)
 1738                         ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 1739                 else if (ip->i_ea_error == 0)
 1740                         ip->i_ea_error = ENOSPC;
 1741                 return (ENOSPC);
 1742         }
 1743         if (ealength != 0) {
 1744                 bcopy(&ealength, p, sizeof(ealength));
 1745                 p += sizeof(ealength);
 1746                 *p++ = ap->a_attrnamespace;
 1747                 *p++ = eapad2;
 1748                 *p++ = strlen(ap->a_name);
 1749                 strcpy(p, ap->a_name);
 1750                 p += strlen(ap->a_name);
 1751                 bzero(p, eapad1);
 1752                 p += eapad1;
 1753                 error = uiomove(p, ealen, ap->a_uio);
 1754                 if (error) {
 1755                         free(eae, M_TEMP);
 1756                         if (stand_alone)
 1757                                 ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
 1758                         else if (ip->i_ea_error == 0)
 1759                                 ip->i_ea_error = error;
 1760                         return (error);
 1761                 }
 1762                 p += ealen;
 1763                 bzero(p, eapad2);
 1764         }
 1765         p = ip->i_ea_area;
 1766         ip->i_ea_area = eae;
 1767         ip->i_ea_len = easize;
 1768         free(p, M_TEMP);
 1769         if (stand_alone)
 1770                 error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
 1771         return (error);
 1772 }
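
      /*
       * To summarize the update logic above: deleting a nonexistent name
       * fails with ENOATTR; a new name is appended at the end of the copied
       * area; and a replacement whose record size differs shifts the tail
       * of the area with bcopy() before the record is rewritten in place.
       * The edited copy only becomes visible once it is swapped into
       * ip->i_ea_area, so a failed uiomove() leaves the cached attribute
       * area untouched.
       */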
