The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/vfs/hammer/hammer_vnops.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
    3  * 
    4  * This code is derived from software contributed to The DragonFly Project
    5  * by Matthew Dillon <dillon@backplane.com>
    6  * 
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in
   15  *    the documentation and/or other materials provided with the
   16  *    distribution.
   17  * 3. Neither the name of The DragonFly Project nor the names of its
   18  *    contributors may be used to endorse or promote products derived
   19  *    from this software without specific, prior written permission.
   20  * 
   21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
   25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
   29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   32  * SUCH DAMAGE.
   33  */
   34 
   35 #include <sys/param.h>
   36 #include <sys/systm.h>
   37 #include <sys/kernel.h>
   38 #include <sys/fcntl.h>
   39 #include <sys/namecache.h>
   40 #include <sys/vnode.h>
   41 #include <sys/lockf.h>
   42 #include <sys/event.h>
   43 #include <sys/stat.h>
   44 #include <sys/dirent.h>
   45 #include <sys/file.h>
   46 #include <vm/vm_extern.h>
   47 #include <vm/swap_pager.h>
   48 #include <vfs/fifofs/fifo.h>
   49 
   50 #include "hammer.h"
   51 
   52 /*
   53  * HAMMER VNOPS: forward declarations of the static handlers referenced by
   54  * the vop_ops tables below (heading formerly read "USERFS VNOPS"). */
   55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
   56 static int hammer_vop_fsync(struct vop_fsync_args *);
   57 static int hammer_vop_read(struct vop_read_args *);
   58 static int hammer_vop_write(struct vop_write_args *);
   59 static int hammer_vop_access(struct vop_access_args *);
   60 static int hammer_vop_advlock(struct vop_advlock_args *);
   61 static int hammer_vop_close(struct vop_close_args *);
   62 static int hammer_vop_ncreate(struct vop_ncreate_args *);
   63 static int hammer_vop_getattr(struct vop_getattr_args *);
   64 static int hammer_vop_nresolve(struct vop_nresolve_args *);
   65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
   66 static int hammer_vop_nlink(struct vop_nlink_args *);
   67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
   68 static int hammer_vop_nmknod(struct vop_nmknod_args *);
   69 static int hammer_vop_open(struct vop_open_args *);
   70 static int hammer_vop_print(struct vop_print_args *);
   71 static int hammer_vop_readdir(struct vop_readdir_args *);
   72 static int hammer_vop_readlink(struct vop_readlink_args *);
   73 static int hammer_vop_nremove(struct vop_nremove_args *);
   74 static int hammer_vop_nrename(struct vop_nrename_args *);
   75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
   76 static int hammer_vop_markatime(struct vop_markatime_args *);
   77 static int hammer_vop_setattr(struct vop_setattr_args *);
   78 static int hammer_vop_strategy(struct vop_strategy_args *);
   79 static int hammer_vop_bmap(struct vop_bmap_args *ap);
   80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
   81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
   82 static int hammer_vop_ioctl(struct vop_ioctl_args *);
   83 static int hammer_vop_mountctl(struct vop_mountctl_args *);
   84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
   85         /* Handlers used only by the hammer_fifo_vops table. */
   86 static int hammer_vop_fifoclose (struct vop_close_args *);
   87 static int hammer_vop_fiforead (struct vop_read_args *);
   88 static int hammer_vop_fifowrite (struct vop_write_args *);
   89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
   90 
   91 struct vop_ops hammer_vnode_vops = {     /* full HAMMER op table (presumably files/dirs - confirm) */
   92         .vop_default =          vop_defaultop,  /* entry for the default op slot */
   93         .vop_fsync =            hammer_vop_fsync,
   94         .vop_getpages =         vop_stdgetpages,        /* vop_std* helper, no hammer_ handler */
   95         .vop_putpages =         vop_stdputpages,        /* vop_std* helper, no hammer_ handler */
   96         .vop_read =             hammer_vop_read,
   97         .vop_write =            hammer_vop_write,
   98         .vop_access =           hammer_vop_access,
   99         .vop_advlock =          hammer_vop_advlock,
  100         .vop_close =            hammer_vop_close,
  101         .vop_ncreate =          hammer_vop_ncreate,
  102         .vop_getattr =          hammer_vop_getattr,
  103         .vop_inactive =         hammer_vop_inactive,    /* not declared in this file; presumably hammer.h */
  104         .vop_reclaim =          hammer_vop_reclaim,     /* not declared in this file; presumably hammer.h */
  105         .vop_nresolve =         hammer_vop_nresolve,
  106         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
  107         .vop_nlink =            hammer_vop_nlink,
  108         .vop_nmkdir =           hammer_vop_nmkdir,
  109         .vop_nmknod =           hammer_vop_nmknod,
  110         .vop_open =             hammer_vop_open,
  111         .vop_pathconf =         vop_stdpathconf,        /* vop_std* helper, no hammer_ handler */
  112         .vop_print =            hammer_vop_print,
  113         .vop_readdir =          hammer_vop_readdir,
  114         .vop_readlink =         hammer_vop_readlink,
  115         .vop_nremove =          hammer_vop_nremove,
  116         .vop_nrename =          hammer_vop_nrename,
  117         .vop_nrmdir =           hammer_vop_nrmdir,
  118         .vop_markatime =        hammer_vop_markatime,
  119         .vop_setattr =          hammer_vop_setattr,
  120         .vop_bmap =             hammer_vop_bmap,
  121         .vop_strategy =         hammer_vop_strategy,
  122         .vop_nsymlink =         hammer_vop_nsymlink,
  123         .vop_nwhiteout =        hammer_vop_nwhiteout,
  124         .vop_ioctl =            hammer_vop_ioctl,
  125         .vop_mountctl =         hammer_vop_mountctl,
  126         .vop_kqfilter =         hammer_vop_kqfilter
  127 };
  128 
  129 struct vop_ops hammer_spec_vops = {      /* reduced op table (presumably device-special vnodes - confirm) */
  130         .vop_default =          vop_defaultop,  /* entry for the default op slot */
  131         .vop_fsync =            hammer_vop_fsync,
  132         .vop_read =             vop_stdnoread,  /* no HAMMER read handler in this table */
  133         .vop_write =            vop_stdnowrite, /* no HAMMER write handler in this table */
  134         .vop_access =           hammer_vop_access,
  135         .vop_close =            hammer_vop_close,
  136         .vop_markatime =        hammer_vop_markatime,
  137         .vop_getattr =          hammer_vop_getattr,
  138         .vop_inactive =         hammer_vop_inactive,
  139         .vop_reclaim =          hammer_vop_reclaim,
  140         .vop_setattr =          hammer_vop_setattr
  141 };
  142 
  143 struct vop_ops hammer_fifo_vops = {      /* op table whose default slot is fifo_vnoperate (presumably FIFO vnodes - confirm) */
  144         .vop_default =          fifo_vnoperate, /* differs from the other two tables, which use vop_defaultop */
  145         .vop_fsync =            hammer_vop_fsync,
  146         .vop_read =             hammer_vop_fiforead,    /* fifo-specific HAMMER wrapper */
  147         .vop_write =            hammer_vop_fifowrite,   /* fifo-specific HAMMER wrapper */
  148         .vop_access =           hammer_vop_access,
  149         .vop_close =            hammer_vop_fifoclose,   /* fifo-specific HAMMER wrapper */
  150         .vop_markatime =        hammer_vop_markatime,
  151         .vop_getattr =          hammer_vop_getattr,
  152         .vop_inactive =         hammer_vop_inactive,
  153         .vop_reclaim =          hammer_vop_reclaim,
  154         .vop_setattr =          hammer_vop_setattr,
  155         .vop_kqfilter =         hammer_vop_fifokqfilter /* fifo-specific HAMMER wrapper */
  156 };
  157 
  158 static __inline      /* helper: hand `flags` to KNOTE() for a vnode, skipping the call when flags is 0 */
  159 void
  160 hammer_knote(struct vnode *vp, int flags)
  161 {
  162         if (flags)      /* zero means the caller accumulated nothing to report */
  163                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);       /* target is the vnode's vpi_kqinfo.ki_note */
  164 }
  165 
  166 #ifdef DEBUG_TRUNCATE
  167 struct hammer_inode *HammerTruncIp;      /* global that exists only in DEBUG_TRUNCATE builds */
  168 #endif
  169         /* Forward declarations of file-local helpers defined later in this file. */
  170 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
  171                            struct vnode *dvp, struct ucred *cred,
  172                            int flags, int isdir);
  173 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
  174 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
  175 
  176 #if 0   /* compiled out: generic entry point that forwards any op to hammer_vnode_vops */
  177 static
  178 int
  179 hammer_vop_vnoperate(struct vop_generic_args *ap)       /* parameter must be named: the body uses ap */
  180 {
  181         return (VOCALL(&hammer_vnode_vops, ap));        /* result of VOCALL is returned unchanged */
  182 }
  183 #endif
  184 
  185 /*
  186  * hammer_vop_fsync { vp, waitfor }
  187  *
  188  * fsync() an inode to disk and wait for it to be completely committed
  189  * such that the information would not be undone if a crash occurred after
  190  * return.  Returns 0 from the fast UNDO/REDO-only path and when mode 4
  191  * ignores the call, otherwise ip->error after the full flush sequence.
  192  *
  193  * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
  194  *       a REDO log.  A sysctl is provided to relax HAMMER's fsync() operation.
  195  *
  196  *       Ultimately the combination of a REDO log and use of fast storage
  197  *       to front-end cluster caches will make fsync fast, but it isn't
  198  *       here yet.  And, in any case, we need real transactional
  199  *       all-or-nothing features which are not restricted to a single file.
  200  */
  201 static
  202 int
  203 hammer_vop_fsync(struct vop_fsync_args *ap)
  204 {
  205         hammer_inode_t ip = VTOI(ap->a_vp);
  206         hammer_mount_t hmp = ip->hmp;
  207         int waitfor = ap->a_waitfor;
  208         int mode;       /* assigned only in cases 2, 3 and default below */
  209 
  210         lwkt_gettoken(&hmp->fs_token);  /* released before each of the three returns */
  211 
  212         /*
  213          * Fsync rule relaxation (default is either full synchronous flush
  214          * or REDO semantics with synchronous flush).
  215          */
  216         if (ap->a_flags & VOP_FSYNC_SYSCALL) {
  217                 switch(hammer_fsync_mode) {
  218                 case 0:
  219 mode0:
  220                         /* no REDO, full synchronous flush */
  221                         goto skip;
  222                 case 1:
  223 mode1:
  224                         /* no REDO, full asynchronous flush */
  225                         if (waitfor == MNT_WAIT)
  226                                 waitfor = MNT_NOWAIT;
  227                         goto skip;
  228                 case 2:
  229                         /* REDO semantics, synchronous flush */
  230                         if (hmp->version < HAMMER_VOL_VERSION_FOUR)
  231                                 goto mode0;     /* version below FOUR: behave as mode 0 */
  232                         mode = HAMMER_FLUSH_UNDOS_AUTO;
  233                         break;
  234                 case 3:
  235                         /* REDO semantics, relaxed asynchronous flush */
  236                         if (hmp->version < HAMMER_VOL_VERSION_FOUR)
  237                                 goto mode1;     /* version below FOUR: behave as mode 1 */
  238                         mode = HAMMER_FLUSH_UNDOS_RELAXED;
  239                         if (waitfor == MNT_WAIT)
  240                                 waitfor = MNT_NOWAIT;
  241                         break;
  242                 case 4:
  243                         /* ignore the fsync() system call */
  244                         lwkt_reltoken(&hmp->fs_token);
  245                         return(0);
  246                 default:
  247                         /* we have to do something (same settings as case 3, no version check) */
  248                         mode = HAMMER_FLUSH_UNDOS_RELAXED;
  249                         if (waitfor == MNT_WAIT)
  250                                 waitfor = MNT_NOWAIT;
  251                         break;
  252                 }
  253 
  254                 /*
  255                  * Fast fsync only needs to flush the UNDO/REDO fifo if
  256                  * HAMMER_INODE_REDO is non-zero and the only modifications
  257                  * made to the file are write or write-extends.
  258                  */
  259                 if ((ip->flags & HAMMER_INODE_REDO) &&
  260                     (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
  261                 ) {
  262                         ++hammer_count_fsyncs;
  263                         hammer_flusher_flush_undos(hmp, mode);
  264                         ip->redo_count = 0;
  265                         if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
  266                                 vclrisdirty(ip->vp);
  267                         lwkt_reltoken(&hmp->fs_token);
  268                         return(0);      /* fast path: ip->error is not consulted here */
  269                 }
  270 
  271                 /*
  272                  * REDO is enabled by fsync(), the idea being we really only
  273                  * want to lay down REDO records when programs are using
  274                  * fsync() heavily.  The first fsync() on the file starts
  275                  * the gravy train going and later fsync()s keep it hot by
  276                  * resetting the redo_count.
  277                  *
  278                  * We weren't running REDOs before now so we have to fall
  279                  * through and do a full fsync of what we have.
  280                  */
  281                 if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
  282                     (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
  283                         ip->flags |= HAMMER_INODE_REDO;
  284                         ip->redo_count = 0;
  285                 }
  286         }
  287 skip:
  288 
  289         /*
  290          * Do a full flush sequence.
  291          *
  292          * Attempt to release the vnode while waiting for the inode to
  293          * finish flushing.  This can really mess up inactive->reclaim
  294          * sequences so only do it if the vnode is active.
  295          *
  296          * WARNING! The VX lock functions must be used.  vn_lock() will
  297          *          fail when this is part of a VOP_RECLAIM sequence.
  298          */
  299         ++hammer_count_fsyncs;
  300         vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
  301         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
  302         if (waitfor == MNT_WAIT) {
  303                 int dorelock;   /* 1 when the VX lock was dropped and must be retaken */
  304 
  305                 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) {
  306                         vx_unlock(ap->a_vp);
  307                         dorelock = 1;
  308                 } else {
  309                         dorelock = 0;   /* VRECLAIMED set: keep the lock across the wait */
  310                 }
  311                 hammer_wait_inode(ip);
  312                 if (dorelock)
  313                         vx_lock(ap->a_vp);
  314         }
  315         if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
  316                 vclrisdirty(ip->vp);
  317         lwkt_reltoken(&hmp->fs_token);
  318         return (ip->error);     /* read after fs_token has been released */
  319 }
  320 
  321 /*
  322  * hammer_vop_read { vp, uio, ioflag, cred }
  323  * Copy file data into the uio; only VREG vnodes are accepted (else EINVAL).
  324  * MPSAFE for the cached case: fs_token is only taken here for the atime update.
  325  */
  326 static
  327 int
  328 hammer_vop_read(struct vop_read_args *ap)
  329 {
  330         struct hammer_transaction trans;
  331         hammer_inode_t ip;
  332         hammer_mount_t hmp;
  333         off_t offset;           /* offset of uio_offset within the current block */
  334         struct buf *bp;
  335         struct uio *uio;
  336         int error;
  337         int n;                  /* bytes to copy out of the current block */
  338         int seqcount;
  339         int ioseqcount;
  340         int blksize;
  341         int bigread;            /* request larger than 100MB, see below */
  342         int got_trans;          /* 1 once trans has been started */
  343         size_t resid;           /* uio_resid before the shortcut, for the statistic */
  344 
  345         if (ap->a_vp->v_type != VREG)
  346                 return (EINVAL);
  347         ip = VTOI(ap->a_vp);
  348         hmp = ip->hmp;
  349         error = 0;
  350         got_trans = 0;
  351         uio = ap->a_uio;
  352 
  353         /*
  354          * Attempt to shortcut directly to the VM object using lwbufs.
  355          * This is much faster than instantiating buffer cache buffers.
  356          */
  357         resid = uio->uio_resid;
  358         error = vop_helper_read_shortcut(ap);
  359         hammer_stats_file_read += resid - uio->uio_resid;       /* counted even if the shortcut failed */
  360         if (error)
  361                 return (error);
  362         if (uio->uio_resid == 0)
  363                 goto finished;
  364 
  365         /*
  366          * Allow the UIO's size to override the sequential heuristic.
  367          */
  368         blksize = hammer_blocksize(uio->uio_offset);
  369         seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
  370         ioseqcount = (ap->a_ioflag >> 16);      /* value carried in ioflag bits 16 and up */
  371         if (seqcount < ioseqcount)
  372                 seqcount = ioseqcount;
  373 
  374         /*
  375          * If reading or writing a huge amount of data we have to break
  376          * atomicity and allow the operation to be interrupted by a signal
  377          * or it can DoS the machine.
  378          */
  379         bigread = (uio->uio_resid > 100 * 1024 * 1024);
  380 
  381         /*
  382          * Access the data typically in HAMMER_BUFSIZE blocks via the
  383          * buffer cache, but HAMMER may use a variable block size based
  384          * on the offset.
  385          *
  386          * XXX Temporary hack, delay the start transaction while we remain
  387          *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
  388          *     locked-shared.
  389          */
  390         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
  391                 int64_t base_offset;    /* block-aligned file offset of this buffer */
  392                 int64_t file_limit;
  393 
  394                 blksize = hammer_blocksize(uio->uio_offset);
  395                 offset = (int)uio->uio_offset & (blksize - 1);
  396                 base_offset = uio->uio_offset - offset;
  397 
  398                 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
  399                         break;
  400 
  401                 /*
  402                  * MPSAFE
  403                  */
  404                 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
  405                 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
  406                         bp->b_flags &= ~B_AGE;  /* of the three flags only B_CACHE is set: no I/O needed */
  407                         error = 0;
  408                         goto skip;
  409                 }
  410                 if (ap->a_ioflag & IO_NRDELAY) {
  411                         bqrelse(bp);
  412                         return (EWOULDBLOCK);   /* NOTE: bypasses the atime update at finished: */
  413                 }
  414 
  415                 /*
  416                  * MPUNSAFE
  417                  */
  418                 if (got_trans == 0) {
  419                         hammer_start_transaction(&trans, ip->hmp);
  420                         got_trans = 1;
  421                 }
  422 
  423                 /*
  424                  * NOTE: A valid bp has already been acquired, but was not
  425                  *       B_CACHE.
  426                  */
  427                 if (hammer_cluster_enable) {
  428                         /*
  429                          * Use file_limit to prevent cluster_read() from
  430                          * creating buffers of the wrong block size past
  431                          * the demarc.
  432                          */
  433                         file_limit = ip->ino_data.size;
  434                         if (base_offset < HAMMER_XDEMARC &&
  435                             file_limit > HAMMER_XDEMARC) {
  436                                 file_limit = HAMMER_XDEMARC;
  437                         }
  438                         error = cluster_readx(ap->a_vp,
  439                                              file_limit, base_offset,
  440                                              blksize, uio->uio_resid,
  441                                              seqcount * BKVASIZE, &bp);
  442                 } else {
  443                         error = breadnx(ap->a_vp, base_offset, blksize,
  444                                         NULL, NULL, 0, &bp);
  445                 }
  446                 if (error) {
  447                         brelse(bp);
  448                         break;
  449                 }
  450 skip:
  451                 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
  452                         kprintf("doff %016jx read file %016jx@%016jx\n",
  453                                 (intmax_t)bp->b_bio2.bio_offset,
  454                                 (intmax_t)ip->obj_id,
  455                                 (intmax_t)bp->b_loffset);
  456                 }
  457                 bp->b_flags &= ~B_IODEBUG;
  458                 if (blksize == HAMMER_XBUFSIZE)
  459                         bp->b_flags |= B_CLUSTEROK;
  460                 /* Clamp n to the rest of the block, the request, and the file size. */
  461                 n = blksize - offset;
  462                 if (n > uio->uio_resid)
  463                         n = uio->uio_resid;
  464                 if (n > ip->ino_data.size - uio->uio_offset)
  465                         n = (int)(ip->ino_data.size - uio->uio_offset);
  466 
  467                 /*
  468                  * Set B_AGE, data has a lower priority than meta-data.
  469                  *
  470                  * Use a hold/unlock/drop sequence to run the uiomove
  471                  * with the buffer unlocked, avoiding deadlocks against
  472                  * read()s on mmap()'d spaces.
  473                  */
  474                 bp->b_flags |= B_AGE;
  475                 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
  476                 bqrelse(bp);
  477 
  478                 if (error)
  479                         break;
  480                 hammer_stats_file_read += n;
  481         }
  482 
  483 finished:
  484 
  485         /*
  486          * Try to update the atime with just the inode lock for maximum
  487          * concurrency.  If we can't shortcut it we have to get the full
  488          * blown transaction.
  489          */
  490         if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) {
  491                 hammer_start_transaction(&trans, ip->hmp);
  492                 got_trans = 1;
  493         }
  494         /* Any transaction started above is always finished here, on both branches. */
  495         if (got_trans) {
  496                 if ((ip->flags & HAMMER_INODE_RO) == 0 &&
  497                     (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
  498                         lwkt_gettoken(&hmp->fs_token);
  499                         ip->ino_data.atime = trans.time;
  500                         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
  501                         hammer_done_transaction(&trans);
  502                         lwkt_reltoken(&hmp->fs_token);
  503                 } else {
  504                         hammer_done_transaction(&trans);
  505                 }
  506         }
  507         return (error);
  508 }
  509 
  510 /*
  511  * hammer_vop_write { vp, uio, ioflag, cred }
  512  */
  513 static
  514 int
  515 hammer_vop_write(struct vop_write_args *ap)
  516 {
  517         struct hammer_transaction trans;
  518         struct hammer_inode *ip;
  519         hammer_mount_t hmp;
  520         thread_t td;
  521         struct uio *uio;
  522         int offset;
  523         off_t base_offset;
  524         int64_t cluster_eof;
  525         struct buf *bp;
  526         int kflags;
  527         int error;
  528         int n;
  529         int flags;
  530         int seqcount;
  531         int bigwrite;
  532 
  533         if (ap->a_vp->v_type != VREG)
  534                 return (EINVAL);
  535         ip = VTOI(ap->a_vp);
  536         hmp = ip->hmp;
  537         error = 0;
  538         kflags = 0;
  539         seqcount = ap->a_ioflag >> 16;
  540 
  541         if (ip->flags & HAMMER_INODE_RO)
  542                 return (EROFS);
  543 
  544         /*
  545          * Create a transaction to cover the operations we perform.
  546          */
  547         hammer_start_transaction(&trans, hmp);
  548         uio = ap->a_uio;
  549 
  550         /*
  551          * Check append mode
  552          */
  553         if (ap->a_ioflag & IO_APPEND)
  554                 uio->uio_offset = ip->ino_data.size;
  555 
  556         /*
  557          * Check for illegal write offsets.  Valid range is 0...2^63-1.
  558          *
  559          * NOTE: the base_offset assignment is required to work around what
  560          * I consider to be a GCC-4 optimization bug.
  561          */
  562         if (uio->uio_offset < 0) {
  563                 hammer_done_transaction(&trans);
  564                 return (EFBIG);
  565         }
  566         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
  567         if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
  568                 hammer_done_transaction(&trans);
  569                 return (EFBIG);
  570         }
  571 
  572         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
  573             base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
  574                 hammer_done_transaction(&trans);
  575                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
  576                 return (EFBIG);
  577         }
  578 
  579         /*
  580          * If reading or writing a huge amount of data we have to break
  581          * atomicity and allow the operation to be interrupted by a signal
  582          * or it can DoS the machine.
  583          *
  584          * Preset redo_count so we stop generating REDOs earlier if the
  585          * limit is exceeded.
  586          *
  587          * redo_count is heuristical, SMP races are ok
  588          */
  589         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
  590         if ((ip->flags & HAMMER_INODE_REDO) &&
  591             ip->redo_count < hammer_limit_redo) {
  592                 ip->redo_count += uio->uio_resid;
  593         }
  594 
  595         /*
  596          * Access the data typically in HAMMER_BUFSIZE blocks via the
  597          * buffer cache, but HAMMER may use a variable block size based
  598          * on the offset.
  599          */
  600         while (uio->uio_resid > 0) {
  601                 int fixsize = 0;
  602                 int blksize;
  603                 int blkmask;
  604                 int trivial;
  605                 int endofblk;
  606                 off_t nsize;
  607 
  608                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
  609                         break;
  610                 if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
  611                         break;
  612 
  613                 blksize = hammer_blocksize(uio->uio_offset);
  614 
  615                 /*
  616                  * Control the number of pending records associated with
  617                  * this inode.  If too many have accumulated start a
  618                  * flush.  Try to maintain a pipeline with the flusher.
  619                  *
  620                  * NOTE: It is possible for other sources to grow the
  621                  *       records but not necessarily issue another flush,
  622                  *       so use a timeout and ensure that a re-flush occurs.
  623                  */
  624                 if (ip->rsv_recs >= hammer_limit_inode_recs) {
  625                         lwkt_gettoken(&hmp->fs_token);
  626                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
  627                         while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
  628                                 ip->flags |= HAMMER_INODE_RECSW;
  629                                 tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
  630                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
  631                         }
  632                         lwkt_reltoken(&hmp->fs_token);
  633                 }
  634 
  635                 /*
  636                  * Do not allow HAMMER to blow out the buffer cache.  Very
  637                  * large UIOs can lockout other processes due to bwillwrite()
  638                  * mechanics.
  639                  *
  640                  * The hammer inode is not locked during these operations.
  641                  * The vnode is locked which can interfere with the pageout
  642                  * daemon for non-UIO_NOCOPY writes but should not interfere
  643                  * with the buffer cache.  Even so, we cannot afford to
  644                  * allow the pageout daemon to build up too many dirty buffer
  645                  * cache buffers.
  646                  *
  647                  * Only call this if we aren't being recursively called from
  648                  * a virtual disk device (vn), else we may deadlock.
  649                  */
  650                 if ((ap->a_ioflag & IO_RECURSE) == 0)
  651                         bwillwrite(blksize);
  652 
  653                 /*
  654                  * Calculate the blocksize at the current offset and figure
  655                  * out how much we can actually write.
  656                  */
  657                 blkmask = blksize - 1;
  658                 offset = (int)uio->uio_offset & blkmask;
  659                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
  660                 n = blksize - offset;
  661                 if (n > uio->uio_resid) {
  662                         n = uio->uio_resid;
  663                         endofblk = 0;
  664                 } else {
  665                         endofblk = 1;
  666                 }
  667                 nsize = uio->uio_offset + n;
  668                 if (nsize > ip->ino_data.size) {
  669                         if (uio->uio_offset > ip->ino_data.size)
  670                                 trivial = 0;
  671                         else
  672                                 trivial = 1;
  673                         nvextendbuf(ap->a_vp,
  674                                     ip->ino_data.size,
  675                                     nsize,
  676                                     hammer_blocksize(ip->ino_data.size),
  677                                     hammer_blocksize(nsize),
  678                                     hammer_blockoff(ip->ino_data.size),
  679                                     hammer_blockoff(nsize),
  680                                     trivial);
  681                         fixsize = 1;
  682                         kflags |= NOTE_EXTEND;
  683                 }
  684 
  685                 if (uio->uio_segflg == UIO_NOCOPY) {
  686                         /*
  687                          * Issuing a write with the same data backing the
  688                          * buffer.  Instantiate the buffer to collect the
  689                          * backing vm pages, then read-in any missing bits.
  690                          *
  691                          * This case is used by vop_stdputpages().
  692                          */
  693                         bp = getblk(ap->a_vp, base_offset,
  694                                     blksize, GETBLK_BHEAVY, 0);
  695                         if ((bp->b_flags & B_CACHE) == 0) {
  696                                 bqrelse(bp);
  697                                 error = bread(ap->a_vp, base_offset,
  698                                               blksize, &bp);
  699                         }
  700                 } else if (offset == 0 && uio->uio_resid >= blksize) {
  701                         /*
  702                          * Even though we are entirely overwriting the buffer
  703                          * we may still have to zero it out to avoid a 
  704                          * mmap/write visibility issue.
  705                          */
  706                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
  707                         if ((bp->b_flags & B_CACHE) == 0)
  708                                 vfs_bio_clrbuf(bp);
  709                 } else if (base_offset >= ip->ino_data.size) {
  710                         /*
  711                          * If the base offset of the buffer is beyond the
  712                          * file EOF, we don't have to issue a read.
  713                          */
  714                         bp = getblk(ap->a_vp, base_offset,
  715                                     blksize, GETBLK_BHEAVY, 0);
  716                         vfs_bio_clrbuf(bp);
  717                 } else {
  718                         /*
  719                          * Partial overwrite, read in any missing bits then
  720                          * replace the portion being written.
  721                          */
  722                         error = bread(ap->a_vp, base_offset, blksize, &bp);
  723                         if (error == 0)
  724                                 bheavy(bp);
  725                 }
  726                 if (error == 0)
  727                         error = uiomovebp(bp, bp->b_data + offset, n, uio);
  728 
  729                 lwkt_gettoken(&hmp->fs_token);
  730 
  731                 /*
  732                  * Generate REDO records if enabled and redo_count will not
  733                  * exceeded the limit.
  734                  *
  735                  * If redo_count exceeds the limit we stop generating records
  736                  * and clear HAMMER_INODE_REDO.  This will cause the next
  737                  * fsync() to do a full meta-data sync instead of just an
  738                  * UNDO/REDO fifo update.
  739                  *
  740                  * When clearing HAMMER_INODE_REDO any pre-existing REDOs
  741                  * will still be tracked.  The tracks will be terminated
  742                  * when the related meta-data (including possible data
  743                  * modifications which are not tracked via REDO) is
  744                  * flushed.
  745                  */
  746                 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
  747                         if (ip->redo_count < hammer_limit_redo) {
  748                                 bp->b_flags |= B_VFSFLAG1;
  749                                 error = hammer_generate_redo(&trans, ip,
  750                                                      base_offset + offset,
  751                                                      HAMMER_REDO_WRITE,
  752                                                      bp->b_data + offset,
  753                                                      (size_t)n);
  754                         } else {
  755                                 ip->flags &= ~HAMMER_INODE_REDO;
  756                         }
  757                 }
  758 
  759                 /*
  760                  * If we screwed up we have to undo any VM size changes we
  761                  * made.
  762                  */
  763                 if (error) {
  764                         brelse(bp);
  765                         if (fixsize) {
  766                                 nvtruncbuf(ap->a_vp, ip->ino_data.size,
  767                                           hammer_blocksize(ip->ino_data.size),
  768                                           hammer_blockoff(ip->ino_data.size),
  769                                           0);
  770                         }
  771                         break;
  772                 }
  773                 kflags |= NOTE_WRITE;
  774                 hammer_stats_file_write += n;
  775                 if (blksize == HAMMER_XBUFSIZE)
  776                         bp->b_flags |= B_CLUSTEROK;
  777                 if (ip->ino_data.size < uio->uio_offset) {
  778                         ip->ino_data.size = uio->uio_offset;
  779                         flags = HAMMER_INODE_SDIRTY;
  780                 } else {
  781                         flags = 0;
  782                 }
  783                 ip->ino_data.mtime = trans.time;
  784                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
  785                 hammer_modify_inode(&trans, ip, flags);
  786 
  787                 /*
  788                  * Once we dirty the buffer any cached zone-X offset
  789                  * becomes invalid.  HAMMER NOTE: no-history mode cannot 
  790                  * allow overwriting over the same data sector unless
  791                  * we provide UNDOs for the old data, which we don't.
  792                  */
  793                 bp->b_bio2.bio_offset = NOOFFSET;
  794 
  795                 lwkt_reltoken(&hmp->fs_token);
  796 
  797                 /*
  798                  * Final buffer disposition.
  799                  *
  800                  * Because meta-data updates are deferred, HAMMER is
  801                  * especially sensitive to excessive bdwrite()s because
  802                  * the I/O stream is not broken up by disk reads.  So the
  803                  * buffer cache simply cannot keep up.
  804                  *
  805                  * WARNING!  blksize is variable.  cluster_write() is
  806                  *           expected to not blow up if it encounters
  807                  *           buffers that do not match the passed blksize.
  808                  *
  809                  * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
  810                  *        The ip->rsv_recs check should burst-flush the data.
  811                  *        If we queue it immediately the buf could be left
  812                  *        locked on the device queue for a very long time.
  813                  *
  814                  *        However, failing to flush a dirty buffer out when
  815                  *        issued from the pageout daemon can result in a low
  816                  *        memory deadlock against bio_page_alloc(), so we
  817                  *        have to bawrite() on IO_ASYNC as well.
  818                  *
  819                  * NOTE!  To avoid degenerate stalls due to mismatched block
  820                  *        sizes we only honor IO_DIRECT on the write which
  821                  *        abuts the end of the buffer.  However, we must
  822                  *        honor IO_SYNC in case someone is silly enough to
  823                  *        configure a HAMMER file as swap, or when HAMMER
  824                  *        is serving NFS (for commits).  Ick ick.
  825                  */
  826                 bp->b_flags |= B_AGE;
  827                 if (blksize == HAMMER_XBUFSIZE)
  828                         bp->b_flags |= B_CLUSTEROK;
  829 
  830                 if (ap->a_ioflag & IO_SYNC) {
  831                         bwrite(bp);
  832                 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
  833                         bawrite(bp);
  834                 } else if (ap->a_ioflag & IO_ASYNC) {
  835                         bawrite(bp);
  836                 } else if (hammer_cluster_enable &&
  837                            !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
  838                         if (base_offset < HAMMER_XDEMARC)
  839                                 cluster_eof = hammer_blockdemarc(base_offset,
  840                                                          ip->ino_data.size);
  841                         else
  842                                 cluster_eof = ip->ino_data.size;
  843                         cluster_write(bp, cluster_eof, blksize, seqcount);
  844                 } else {
  845                         bdwrite(bp);
  846                 }
  847         }
  848         hammer_done_transaction(&trans);
  849         hammer_knote(ap->a_vp, kflags);
  850 
  851         return (error);
  852 }
  853 
  854 /*
  855  * hammer_vop_access { vp, mode, cred }
  856  *
  857  * MPSAFE - does not require fs_token
  858  */
  859 static
  860 int
  861 hammer_vop_access(struct vop_access_args *ap)
  862 {
  863         struct hammer_inode *ip = VTOI(ap->a_vp);
  864         uid_t uid;
  865         gid_t gid;
  866         int error;
  867 
  868         ++hammer_stats_file_iopsr;
  869         uid = hammer_to_unix_xid(&ip->ino_data.uid);
  870         gid = hammer_to_unix_xid(&ip->ino_data.gid);
  871 
  872         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
  873                                   ip->ino_data.uflags);
  874         return (error);
  875 }
  876 
  877 /*
  878  * hammer_vop_advlock { vp, id, op, fl, flags }
  879  *
  880  * MPSAFE - does not require fs_token
  881  */
  882 static
  883 int
  884 hammer_vop_advlock(struct vop_advlock_args *ap)
  885 {
  886         hammer_inode_t ip = VTOI(ap->a_vp);
  887 
  888         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
  889 }
  890 
  891 /*
  892  * hammer_vop_close { vp, fflag }
  893  *
  894  * We can only sync-on-close for normal closes.  XXX disabled for now.
  895  */
  896 static
  897 int
  898 hammer_vop_close(struct vop_close_args *ap)
  899 {
  900 #if 0
  901         struct vnode *vp = ap->a_vp;
  902         hammer_inode_t ip = VTOI(vp);
  903         int waitfor;
  904         if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
  905                 if (vn_islocked(vp) == LK_EXCLUSIVE &&
  906                     (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
  907                         if (ip->flags & HAMMER_INODE_CLOSESYNC)
  908                                 waitfor = MNT_WAIT;
  909                         else
  910                                 waitfor = MNT_NOWAIT;
  911                         ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
  912                                        HAMMER_INODE_CLOSEASYNC);
  913                         VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
  914                 }
  915         }
  916 #endif
  917         return (vop_stdclose(ap));
  918 }
  919 
  920 /*
  921  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
  922  *
  923  * The operating system has already ensured that the directory entry
  924  * does not exist and done all appropriate namespace locking.
  925  */
  926 static
  927 int
  928 hammer_vop_ncreate(struct vop_ncreate_args *ap)
  929 {
  930         struct hammer_transaction trans;
  931         struct hammer_inode *dip;
  932         struct hammer_inode *nip;
  933         struct nchandle *nch;
  934         hammer_mount_t hmp;
  935         int error;
  936 
  937         nch = ap->a_nch;
  938         dip = VTOI(ap->a_dvp);
  939         hmp = dip->hmp;
  940 
  941         if (dip->flags & HAMMER_INODE_RO)
  942                 return (EROFS);
  943         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
  944                 return (error);
  945 
  946         /*
  947          * Create a transaction to cover the operations we perform.
  948          */
  949         lwkt_gettoken(&hmp->fs_token);
  950         hammer_start_transaction(&trans, hmp);
  951         ++hammer_stats_file_iopsw;
  952 
  953         /*
  954          * Create a new filesystem object of the requested type.  The
  955          * returned inode will be referenced and shared-locked to prevent
  956          * it from being moved to the flusher.
  957          */
  958         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
  959                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
  960                                     NULL, &nip);
  961         if (error) {
  962                 hkprintf("hammer_create_inode error %d\n", error);
  963                 hammer_done_transaction(&trans);
  964                 *ap->a_vpp = NULL;
  965                 lwkt_reltoken(&hmp->fs_token);
  966                 return (error);
  967         }
  968 
  969         /*
  970          * Add the new filesystem object to the directory.  This will also
  971          * bump the inode's link count.
  972          */
  973         error = hammer_ip_add_directory(&trans, dip,
  974                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
  975                                         nip);
  976         if (error)
  977                 hkprintf("hammer_ip_add_directory error %d\n", error);
  978 
  979         /*
  980          * Finish up.
  981          */
  982         if (error) {
  983                 hammer_rel_inode(nip, 0);
  984                 hammer_done_transaction(&trans);
  985                 *ap->a_vpp = NULL;
  986         } else {
  987                 error = hammer_get_vnode(nip, ap->a_vpp);
  988                 hammer_done_transaction(&trans);
  989                 hammer_rel_inode(nip, 0);
  990                 if (error == 0) {
  991                         cache_setunresolved(ap->a_nch);
  992                         cache_setvp(ap->a_nch, *ap->a_vpp);
  993                 }
  994                 hammer_knote(ap->a_dvp, NOTE_WRITE);
  995         }
  996         lwkt_reltoken(&hmp->fs_token);
  997         return (error);
  998 }
  999 
 1000 /*
 1001  * hammer_vop_getattr { vp, vap }
 1002  *
 1003  * Retrieve an inode's attribute information.  When accessing inodes
 1004  * historically we fake the atime field to ensure consistent results.
 1005  * The atime field is stored in the B-Tree element and allowed to be
 1006  * updated without cycling the element.
 1007  *
 1008  * MPSAFE - does not require fs_token
 1009  */
 1010 static
 1011 int
 1012 hammer_vop_getattr(struct vop_getattr_args *ap)
 1013 {
 1014         struct hammer_inode *ip = VTOI(ap->a_vp);
 1015         struct vattr *vap = ap->a_vap;
 1016 
 1017         /*
 1018          * We want the fsid to be different when accessing a filesystem
 1019          * with different as-of's so programs like diff don't think
 1020          * the files are the same.
 1021          *
 1022          * We also want the fsid to be the same when comparing snapshots,
 1023          * or when comparing mirrors (which might be backed by different
 1024          * physical devices).  HAMMER fsids are based on the PFS's
 1025          * shared_uuid field.
 1026          *
 1027          * XXX there is a chance of collision here.  The va_fsid reported
 1028          * by stat is different from the more involved fsid used in the
 1029          * mount structure.
 1030          */
 1031         ++hammer_stats_file_iopsr;
 1032         hammer_lock_sh(&ip->lock);
 1033         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
 1034                        (u_int32_t)(ip->obj_asof >> 32);
 1035 
 1036         vap->va_fileid = ip->ino_leaf.base.obj_id;
 1037         vap->va_mode = ip->ino_data.mode;
 1038         vap->va_nlink = ip->ino_data.nlinks;
 1039         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
 1040         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
 1041         vap->va_rmajor = 0;
 1042         vap->va_rminor = 0;
 1043         vap->va_size = ip->ino_data.size;
 1044 
 1045         /*
 1046          * Special case for @@PFS softlinks.  The actual size of the
 1047          * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
 1048          * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
 1049          */
 1050         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
 1051             ip->ino_data.size == 10 &&
 1052             ip->obj_asof == HAMMER_MAX_TID &&
 1053             ip->obj_localization == 0 &&
 1054             strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
 1055                     if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
 1056                             vap->va_size = 26;
 1057                     else
 1058                             vap->va_size = 10;
 1059         }
 1060 
 1061         /*
 1062          * We must provide a consistent atime and mtime for snapshots
 1063          * so people can do a 'tar cf - ... | md5' on them and get
 1064          * consistent results.
 1065          */
 1066         if (ip->flags & HAMMER_INODE_RO) {
 1067                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
 1068                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
 1069         } else {
 1070                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
 1071                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
 1072         }
 1073         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
 1074         vap->va_flags = ip->ino_data.uflags;
 1075         vap->va_gen = 1;        /* hammer inums are unique for all time */
 1076         vap->va_blocksize = HAMMER_BUFSIZE;
 1077         if (ip->ino_data.size >= HAMMER_XDEMARC) {
 1078                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
 1079                                 ~HAMMER_XBUFMASK64;
 1080         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
 1081                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
 1082                                 ~HAMMER_BUFMASK64;
 1083         } else {
 1084                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
 1085         }
 1086 
 1087         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
 1088         vap->va_filerev = 0;    /* XXX */
 1089         vap->va_uid_uuid = ip->ino_data.uid;
 1090         vap->va_gid_uuid = ip->ino_data.gid;
 1091         vap->va_fsid_uuid = ip->hmp->fsid;
 1092         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
 1093                           VA_FSID_UUID_VALID;
 1094 
 1095         switch (ip->ino_data.obj_type) {
 1096         case HAMMER_OBJTYPE_CDEV:
 1097         case HAMMER_OBJTYPE_BDEV:
 1098                 vap->va_rmajor = ip->ino_data.rmajor;
 1099                 vap->va_rminor = ip->ino_data.rminor;
 1100                 break;
 1101         default:
 1102                 break;
 1103         }
 1104         hammer_unlock(&ip->lock);
 1105         return(0);
 1106 }
 1107 
 1108 /*
 1109  * hammer_vop_nresolve { nch, dvp, cred }
 1110  *
 1111  * Locate the requested directory entry.
 1112  */
 1113 static
 1114 int
 1115 hammer_vop_nresolve(struct vop_nresolve_args *ap)
 1116 {
 1117         struct hammer_transaction trans;
 1118         struct namecache *ncp;
 1119         hammer_mount_t hmp;
 1120         hammer_inode_t dip;
 1121         hammer_inode_t ip;
 1122         hammer_tid_t asof;
 1123         struct hammer_cursor cursor;
 1124         struct vnode *vp;
 1125         int64_t namekey;
 1126         int error;
 1127         int i;
 1128         int nlen;
 1129         int flags;
 1130         int ispfs;
 1131         int64_t obj_id;
 1132         u_int32_t localization;
 1133         u_int32_t max_iterations;
 1134 
 1135         /*
 1136          * Misc initialization, plus handle as-of name extensions.  Look for
 1137          * the '@@' extension.  Note that as-of files and directories cannot
 1138          * be modified.
 1139          */
 1140         dip = VTOI(ap->a_dvp);
 1141         ncp = ap->a_nch->ncp;
 1142         asof = dip->obj_asof;
 1143         localization = dip->obj_localization;   /* for code consistency */
 1144         nlen = ncp->nc_nlen;
 1145         flags = dip->flags & HAMMER_INODE_RO;
 1146         ispfs = 0;
 1147         hmp = dip->hmp;
 1148 
 1149         lwkt_gettoken(&hmp->fs_token);
 1150         hammer_simple_transaction(&trans, hmp);
 1151         ++hammer_stats_file_iopsr;
 1152 
 1153         for (i = 0; i < nlen; ++i) {
 1154                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
 1155                         error = hammer_str_to_tid(ncp->nc_name + i + 2,
 1156                                                   &ispfs, &asof, &localization);
 1157                         if (error != 0) {
 1158                                 i = nlen;
 1159                                 break;
 1160                         }
 1161                         if (asof != HAMMER_MAX_TID)
 1162                                 flags |= HAMMER_INODE_RO;
 1163                         break;
 1164                 }
 1165         }
 1166         nlen = i;
 1167 
 1168         /*
 1169          * If this is a PFS softlink we dive into the PFS
 1170          */
 1171         if (ispfs && nlen == 0) {
 1172                 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
 1173                                       asof, localization,
 1174                                       flags, &error);
 1175                 if (error == 0) {
 1176                         error = hammer_get_vnode(ip, &vp);
 1177                         hammer_rel_inode(ip, 0);
 1178                 } else {
 1179                         vp = NULL;
 1180                 }
 1181                 if (error == 0) {
 1182                         vn_unlock(vp);
 1183                         cache_setvp(ap->a_nch, vp);
 1184                         vrele(vp);
 1185                 }
 1186                 goto done;
 1187         }
 1188 
 1189         /*
 1190          * If there is no path component the time extension is relative to dip.
 1191          * e.g. "fubar/@@<snapshot>"
 1192          *
 1193          * "." is handled by the kernel, but ".@@<snapshot>" is not.
 1194          * e.g. "fubar/.@@<snapshot>"
 1195          *
 1196          * ".." is handled by the kernel.  We do not currently handle
 1197          * "..@<snapshot>".
 1198          */
 1199         if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
 1200                 ip = hammer_get_inode(&trans, dip, dip->obj_id,
 1201                                       asof, dip->obj_localization,
 1202                                       flags, &error);
 1203                 if (error == 0) {
 1204                         error = hammer_get_vnode(ip, &vp);
 1205                         hammer_rel_inode(ip, 0);
 1206                 } else {
 1207                         vp = NULL;
 1208                 }
 1209                 if (error == 0) {
 1210                         vn_unlock(vp);
 1211                         cache_setvp(ap->a_nch, vp);
 1212                         vrele(vp);
 1213                 }
 1214                 goto done;
 1215         }
 1216 
 1217         /*
 1218          * Calculate the namekey and setup the key range for the scan.  This
 1219          * works kinda like a chained hash table where the lower 32 bits
 1220          * of the namekey synthesize the chain.
 1221          *
 1222          * The key range is inclusive of both key_beg and key_end.
 1223          */
 1224         namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
 1225                                            &max_iterations);
 1226 
 1227         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
 1228         cursor.key_beg.localization = dip->obj_localization +
 1229                                       hammer_dir_localization(dip);
 1230         cursor.key_beg.obj_id = dip->obj_id;
 1231         cursor.key_beg.key = namekey;
 1232         cursor.key_beg.create_tid = 0;
 1233         cursor.key_beg.delete_tid = 0;
 1234         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
 1235         cursor.key_beg.obj_type = 0;
 1236 
 1237         cursor.key_end = cursor.key_beg;
 1238         cursor.key_end.key += max_iterations;
 1239         cursor.asof = asof;
 1240         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
 1241 
 1242         /*
 1243          * Scan all matching records (the chain), locate the one matching
 1244          * the requested path component.
 1245          *
 1246          * The hammer_ip_*() functions merge in-memory records with on-disk
 1247          * records for the purposes of the search.
 1248          */
 1249         obj_id = 0;
 1250         localization = HAMMER_DEF_LOCALIZATION;
 1251 
 1252         if (error == 0) {
 1253                 error = hammer_ip_first(&cursor);
 1254                 while (error == 0) {
 1255                         error = hammer_ip_resolve_data(&cursor);
 1256                         if (error)
 1257                                 break;
 1258                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
 1259                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
 1260                                 obj_id = cursor.data->entry.obj_id;
 1261                                 localization = cursor.data->entry.localization;
 1262                                 break;
 1263                         }
 1264                         error = hammer_ip_next(&cursor);
 1265                 }
 1266         }
 1267         hammer_done_cursor(&cursor);
 1268 
 1269         /*
 1270          * Lookup the obj_id.  This should always succeed.  If it does not
 1271          * the filesystem may be damaged and we return a dummy inode.
 1272          */
 1273         if (error == 0) {
 1274                 ip = hammer_get_inode(&trans, dip, obj_id,
 1275                                       asof, localization,
 1276                                       flags, &error);
 1277                 if (error == ENOENT) {
 1278                         kprintf("HAMMER: WARNING: Missing "
 1279                                 "inode for dirent \"%s\"\n"
 1280                                 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
 1281                                 ncp->nc_name,
 1282                                 (long long)obj_id, (long long)asof,
 1283                                 localization);
 1284                         error = 0;
 1285                         ip = hammer_get_dummy_inode(&trans, dip, obj_id,
 1286                                                     asof, localization,
 1287                                                     flags, &error);
 1288                 }
 1289                 if (error == 0) {
 1290                         error = hammer_get_vnode(ip, &vp);
 1291                         hammer_rel_inode(ip, 0);
 1292                 } else {
 1293                         vp = NULL;
 1294                 }
 1295                 if (error == 0) {
 1296                         vn_unlock(vp);
 1297                         cache_setvp(ap->a_nch, vp);
 1298                         vrele(vp);
 1299                 }
 1300         } else if (error == ENOENT) {
 1301                 cache_setvp(ap->a_nch, NULL);
 1302         }
 1303 done:
 1304         hammer_done_transaction(&trans);
 1305         lwkt_reltoken(&hmp->fs_token);
 1306         return (error);
 1307 }
 1308 
 1309 /*
 1310  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 1311  *
 1312  * Locate the parent directory of a directory vnode.
 1313  *
 1314  * dvp is referenced but not locked.  *vpp must be returned referenced and
 1315  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 1316  * at the root, instead it could indicate that the directory we were in was
 1317  * removed.
 1318  *
 1319  * NOTE: as-of sequences are not linked into the directory structure.  If
 1320  * we are at the root with a different asof then the mount point, reload
 1321  * the same directory with the mount point's asof.   I'm not sure what this
 1322  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 1323  * get confused, but it hasn't been tested.
 1324  */
 1325 static
 1326 int
 1327 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
 1328 {
 1329         struct hammer_transaction trans;
 1330         struct hammer_inode *dip;
 1331         struct hammer_inode *ip;
 1332         hammer_mount_t hmp;
 1333         int64_t parent_obj_id;
 1334         u_int32_t parent_obj_localization;
 1335         hammer_tid_t asof;
 1336         int error;
 1337 
 1338         dip = VTOI(ap->a_dvp);
 1339         asof = dip->obj_asof;
 1340         hmp = dip->hmp;
 1341 
 1342         /*
 1343          * Whos are parent?  This could be the root of a pseudo-filesystem
 1344          * whos parent is in another localization domain.
 1345          */
 1346         lwkt_gettoken(&hmp->fs_token);
 1347         parent_obj_id = dip->ino_data.parent_obj_id;
 1348         if (dip->obj_id == HAMMER_OBJID_ROOT)
 1349                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
 1350         else
 1351                 parent_obj_localization = dip->obj_localization;
 1352 
 1353         if (parent_obj_id == 0) {
 1354                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
 1355                    asof != hmp->asof) {
 1356                         parent_obj_id = dip->obj_id;
 1357                         asof = hmp->asof;
 1358                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
 1359                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
 1360                                   (long long)dip->obj_asof);
 1361                 } else {
 1362                         *ap->a_vpp = NULL;
 1363                         lwkt_reltoken(&hmp->fs_token);
 1364                         return ENOENT;
 1365                 }
 1366         }
 1367 
 1368         hammer_simple_transaction(&trans, hmp);
 1369         ++hammer_stats_file_iopsr;
 1370 
 1371         ip = hammer_get_inode(&trans, dip, parent_obj_id,
 1372                               asof, parent_obj_localization,
 1373                               dip->flags, &error);
 1374         if (ip) {
 1375                 error = hammer_get_vnode(ip, ap->a_vpp);
 1376                 hammer_rel_inode(ip, 0);
 1377         } else {
 1378                 *ap->a_vpp = NULL;
 1379         }
 1380         hammer_done_transaction(&trans);
 1381         lwkt_reltoken(&hmp->fs_token);
 1382         return (error);
 1383 }
 1384 
 1385 /*
 1386  * hammer_vop_nlink { nch, dvp, vp, cred }
 1387  */
 1388 static
 1389 int
 1390 hammer_vop_nlink(struct vop_nlink_args *ap)
 1391 {
 1392         struct hammer_transaction trans;
 1393         struct hammer_inode *dip;
 1394         struct hammer_inode *ip;
 1395         struct nchandle *nch;
 1396         hammer_mount_t hmp;
 1397         int error;
 1398 
 1399         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
 1400                 return(EXDEV);
 1401 
 1402         nch = ap->a_nch;
 1403         dip = VTOI(ap->a_dvp);
 1404         ip = VTOI(ap->a_vp);
 1405         hmp = dip->hmp;
 1406 
 1407         if (dip->obj_localization != ip->obj_localization)
 1408                 return(EXDEV);
 1409 
 1410         if (dip->flags & HAMMER_INODE_RO)
 1411                 return (EROFS);
 1412         if (ip->flags & HAMMER_INODE_RO)
 1413                 return (EROFS);
 1414         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 1415                 return (error);
 1416 
 1417         /*
 1418          * Create a transaction to cover the operations we perform.
 1419          */
 1420         lwkt_gettoken(&hmp->fs_token);
 1421         hammer_start_transaction(&trans, hmp);
 1422         ++hammer_stats_file_iopsw;
 1423 
 1424         /*
 1425          * Add the filesystem object to the directory.  Note that neither
 1426          * dip nor ip are referenced or locked, but their vnodes are
 1427          * referenced.  This function will bump the inode's link count.
 1428          */
 1429         error = hammer_ip_add_directory(&trans, dip,
 1430                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
 1431                                         ip);
 1432 
 1433         /*
 1434          * Finish up.
 1435          */
 1436         if (error == 0) {
 1437                 cache_setunresolved(nch);
 1438                 cache_setvp(nch, ap->a_vp);
 1439         }
 1440         hammer_done_transaction(&trans);
 1441         hammer_knote(ap->a_vp, NOTE_LINK);
 1442         hammer_knote(ap->a_dvp, NOTE_WRITE);
 1443         lwkt_reltoken(&hmp->fs_token);
 1444         return (error);
 1445 }
 1446 
 1447 /*
 1448  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 1449  *
 1450  * The operating system has already ensured that the directory entry
 1451  * does not exist and done all appropriate namespace locking.
 1452  */
 1453 static
 1454 int
 1455 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
 1456 {
 1457         struct hammer_transaction trans;
 1458         struct hammer_inode *dip;
 1459         struct hammer_inode *nip;
 1460         struct nchandle *nch;
 1461         hammer_mount_t hmp;
 1462         int error;
 1463 
 1464         nch = ap->a_nch;
 1465         dip = VTOI(ap->a_dvp);
 1466         hmp = dip->hmp;
 1467 
 1468         if (dip->flags & HAMMER_INODE_RO)
 1469                 return (EROFS);
 1470         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 1471                 return (error);
 1472 
 1473         /*
 1474          * Create a transaction to cover the operations we perform.
 1475          */
 1476         lwkt_gettoken(&hmp->fs_token);
 1477         hammer_start_transaction(&trans, hmp);
 1478         ++hammer_stats_file_iopsw;
 1479 
 1480         /*
 1481          * Create a new filesystem object of the requested type.  The
 1482          * returned inode will be referenced but not locked.
 1483          */
 1484         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
 1485                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
 1486                                     NULL, &nip);
 1487         if (error) {
 1488                 hkprintf("hammer_mkdir error %d\n", error);
 1489                 hammer_done_transaction(&trans);
 1490                 *ap->a_vpp = NULL;
 1491                 lwkt_reltoken(&hmp->fs_token);
 1492                 return (error);
 1493         }
 1494         /*
 1495          * Add the new filesystem object to the directory.  This will also
 1496          * bump the inode's link count.
 1497          */
 1498         error = hammer_ip_add_directory(&trans, dip,
 1499                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
 1500                                         nip);
 1501         if (error)
 1502                 hkprintf("hammer_mkdir (add) error %d\n", error);
 1503 
 1504         /*
 1505          * Finish up.
 1506          */
 1507         if (error) {
 1508                 hammer_rel_inode(nip, 0);
 1509                 *ap->a_vpp = NULL;
 1510         } else {
 1511                 error = hammer_get_vnode(nip, ap->a_vpp);
 1512                 hammer_rel_inode(nip, 0);
 1513                 if (error == 0) {
 1514                         cache_setunresolved(ap->a_nch);
 1515                         cache_setvp(ap->a_nch, *ap->a_vpp);
 1516                 }
 1517         }
 1518         hammer_done_transaction(&trans);
 1519         if (error == 0)
 1520                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
 1521         lwkt_reltoken(&hmp->fs_token);
 1522         return (error);
 1523 }
 1524 
 1525 /*
 1526  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 1527  *
 1528  * The operating system has already ensured that the directory entry
 1529  * does not exist and done all appropriate namespace locking.
 1530  */
 1531 static
 1532 int
 1533 hammer_vop_nmknod(struct vop_nmknod_args *ap)
 1534 {
 1535         struct hammer_transaction trans;
 1536         struct hammer_inode *dip;
 1537         struct hammer_inode *nip;
 1538         struct nchandle *nch;
 1539         hammer_mount_t hmp;
 1540         int error;
 1541 
 1542         nch = ap->a_nch;
 1543         dip = VTOI(ap->a_dvp);
 1544         hmp = dip->hmp;
 1545 
 1546         if (dip->flags & HAMMER_INODE_RO)
 1547                 return (EROFS);
 1548         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 1549                 return (error);
 1550 
 1551         /*
 1552          * Create a transaction to cover the operations we perform.
 1553          */
 1554         lwkt_gettoken(&hmp->fs_token);
 1555         hammer_start_transaction(&trans, hmp);
 1556         ++hammer_stats_file_iopsw;
 1557 
 1558         /*
 1559          * Create a new filesystem object of the requested type.  The
 1560          * returned inode will be referenced but not locked.
 1561          *
 1562          * If mknod specifies a directory a pseudo-fs is created.
 1563          */
 1564         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
 1565                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
 1566                                     NULL, &nip);
 1567         if (error) {
 1568                 hammer_done_transaction(&trans);
 1569                 *ap->a_vpp = NULL;
 1570                 lwkt_reltoken(&hmp->fs_token);
 1571                 return (error);
 1572         }
 1573 
 1574         /*
 1575          * Add the new filesystem object to the directory.  This will also
 1576          * bump the inode's link count.
 1577          */
 1578         error = hammer_ip_add_directory(&trans, dip,
 1579                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
 1580                                         nip);
 1581 
 1582         /*
 1583          * Finish up.
 1584          */
 1585         if (error) {
 1586                 hammer_rel_inode(nip, 0);
 1587                 *ap->a_vpp = NULL;
 1588         } else {
 1589                 error = hammer_get_vnode(nip, ap->a_vpp);
 1590                 hammer_rel_inode(nip, 0);
 1591                 if (error == 0) {
 1592                         cache_setunresolved(ap->a_nch);
 1593                         cache_setvp(ap->a_nch, *ap->a_vpp);
 1594                 }
 1595         }
 1596         hammer_done_transaction(&trans);
 1597         if (error == 0)
 1598                 hammer_knote(ap->a_dvp, NOTE_WRITE);
 1599         lwkt_reltoken(&hmp->fs_token);
 1600         return (error);
 1601 }
 1602 
 1603 /*
 1604  * hammer_vop_open { vp, mode, cred, fp }
 1605  *
 1606  * MPSAFE (does not require fs_token)
 1607  */
 1608 static
 1609 int
 1610 hammer_vop_open(struct vop_open_args *ap)
 1611 {
 1612         hammer_inode_t ip;
 1613 
 1614         ++hammer_stats_file_iopsr;
 1615         ip = VTOI(ap->a_vp);
 1616 
 1617         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
 1618                 return (EROFS);
 1619         return(vop_stdopen(ap));
 1620 }
 1621 
 1622 /*
 1623  * hammer_vop_print { vp }
 1624  */
 1625 static
 1626 int
 1627 hammer_vop_print(struct vop_print_args *ap)
 1628 {
 1629         return EOPNOTSUPP;
 1630 }
 1631 
 1632 /*
 1633  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 1634  */
 1635 static
 1636 int
 1637 hammer_vop_readdir(struct vop_readdir_args *ap)
 1638 {
 1639         struct hammer_transaction trans;
 1640         struct hammer_cursor cursor;
 1641         struct hammer_inode *ip;
 1642         hammer_mount_t hmp;
 1643         struct uio *uio;
 1644         hammer_base_elm_t base;
 1645         int error;
 1646         int cookie_index;
 1647         int ncookies;
 1648         off_t *cookies;
 1649         off_t saveoff;
 1650         int r;
 1651         int dtype;
 1652 
 1653         ++hammer_stats_file_iopsr;
 1654         ip = VTOI(ap->a_vp);
 1655         uio = ap->a_uio;
 1656         saveoff = uio->uio_offset;
 1657         hmp = ip->hmp;
 1658 
 1659         if (ap->a_ncookies) {
 1660                 ncookies = uio->uio_resid / 16 + 1;
 1661                 if (ncookies > 1024)
 1662                         ncookies = 1024;
 1663                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
 1664                 cookie_index = 0;
 1665         } else {
 1666                 ncookies = -1;
 1667                 cookies = NULL;
 1668                 cookie_index = 0;
 1669         }
 1670 
 1671         lwkt_gettoken(&hmp->fs_token);
 1672         hammer_simple_transaction(&trans, hmp);
 1673 
 1674         /*
 1675          * Handle artificial entries
 1676          *
 1677          * It should be noted that the minimum value for a directory
 1678          * hash key on-media is 0x0000000100000000, so we can use anything
 1679          * less then that to represent our 'special' key space.
 1680          */
 1681         error = 0;
 1682         if (saveoff == 0) {
 1683                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
 1684                 if (r)
 1685                         goto done;
 1686                 if (cookies)
 1687                         cookies[cookie_index] = saveoff;
 1688                 ++saveoff;
 1689                 ++cookie_index;
 1690                 if (cookie_index == ncookies)
 1691                         goto done;
 1692         }
 1693         if (saveoff == 1) {
 1694                 if (ip->ino_data.parent_obj_id) {
 1695                         r = vop_write_dirent(&error, uio,
 1696                                              ip->ino_data.parent_obj_id,
 1697                                              DT_DIR, 2, "..");
 1698                 } else {
 1699                         r = vop_write_dirent(&error, uio,
 1700                                              ip->obj_id, DT_DIR, 2, "..");
 1701                 }
 1702                 if (r)
 1703                         goto done;
 1704                 if (cookies)
 1705                         cookies[cookie_index] = saveoff;
 1706                 ++saveoff;
 1707                 ++cookie_index;
 1708                 if (cookie_index == ncookies)
 1709                         goto done;
 1710         }
 1711 
 1712         /*
 1713          * Key range (begin and end inclusive) to scan.  Directory keys
 1714          * directly translate to a 64 bit 'seek' position.
 1715          */
 1716         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 1717         cursor.key_beg.localization = ip->obj_localization +
 1718                                       hammer_dir_localization(ip);
 1719         cursor.key_beg.obj_id = ip->obj_id;
 1720         cursor.key_beg.create_tid = 0;
 1721         cursor.key_beg.delete_tid = 0;
 1722         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
 1723         cursor.key_beg.obj_type = 0;
 1724         cursor.key_beg.key = saveoff;
 1725 
 1726         cursor.key_end = cursor.key_beg;
 1727         cursor.key_end.key = HAMMER_MAX_KEY;
 1728         cursor.asof = ip->obj_asof;
 1729         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
 1730 
 1731         error = hammer_ip_first(&cursor);
 1732 
 1733         while (error == 0) {
 1734                 error = hammer_ip_resolve_data(&cursor);
 1735                 if (error)
 1736                         break;
 1737                 base = &cursor.leaf->base;
 1738                 saveoff = base->key;
 1739                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
 1740 
 1741                 if (base->obj_id != ip->obj_id)
 1742                         panic("readdir: bad record at %p", cursor.node);
 1743 
 1744                 /*
 1745                  * Convert pseudo-filesystems into softlinks
 1746                  */
 1747                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
 1748                 r = vop_write_dirent(
 1749                              &error, uio, cursor.data->entry.obj_id,
 1750                              dtype,
 1751                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
 1752                              (void *)cursor.data->entry.name);
 1753                 if (r)
 1754                         break;
 1755                 ++saveoff;
 1756                 if (cookies)
 1757                         cookies[cookie_index] = base->key;
 1758                 ++cookie_index;
 1759                 if (cookie_index == ncookies)
 1760                         break;
 1761                 error = hammer_ip_next(&cursor);
 1762         }
 1763         hammer_done_cursor(&cursor);
 1764 
 1765 done:
 1766         hammer_done_transaction(&trans);
 1767 
 1768         if (ap->a_eofflag)
 1769                 *ap->a_eofflag = (error == ENOENT);
 1770         uio->uio_offset = saveoff;
 1771         if (error && cookie_index == 0) {
 1772                 if (error == ENOENT)
 1773                         error = 0;
 1774                 if (cookies) {
 1775                         kfree(cookies, M_TEMP);
 1776                         *ap->a_ncookies = 0;
 1777                         *ap->a_cookies = NULL;
 1778                 }
 1779         } else {
 1780                 if (error == ENOENT)
 1781                         error = 0;
 1782                 if (cookies) {
 1783                         *ap->a_ncookies = cookie_index;
 1784                         *ap->a_cookies = cookies;
 1785                 }
 1786         }
 1787         lwkt_reltoken(&hmp->fs_token);
 1788         return(error);
 1789 }
 1790 
 1791 /*
 1792  * hammer_vop_readlink { vp, uio, cred }
 1793  */
 1794 static
 1795 int
 1796 hammer_vop_readlink(struct vop_readlink_args *ap)
 1797 {
 1798         struct hammer_transaction trans;
 1799         struct hammer_cursor cursor;
 1800         struct hammer_inode *ip;
 1801         hammer_mount_t hmp;
 1802         char buf[32];
 1803         u_int32_t localization;
 1804         hammer_pseudofs_inmem_t pfsm;
 1805         int error;
 1806 
 1807         ip = VTOI(ap->a_vp);
 1808         hmp = ip->hmp;
 1809 
 1810         lwkt_gettoken(&hmp->fs_token);
 1811 
 1812         /*
 1813          * Shortcut if the symlink data was stuffed into ino_data.
 1814          *
 1815          * Also expand special "@@PFS%05d" softlinks (expansion only
 1816          * occurs for non-historical (current) accesses made from the
 1817          * primary filesystem).
 1818          */
 1819         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
 1820                 char *ptr;
 1821                 int bytes;
 1822 
 1823                 ptr = ip->ino_data.ext.symlink;
 1824                 bytes = (int)ip->ino_data.size;
 1825                 if (bytes == 10 &&
 1826                     ip->obj_asof == HAMMER_MAX_TID &&
 1827                     ip->obj_localization == 0 &&
 1828                     strncmp(ptr, "@@PFS", 5) == 0) {
 1829                         hammer_simple_transaction(&trans, hmp);
 1830                         bcopy(ptr + 5, buf, 5);
 1831                         buf[5] = 0;
 1832                         localization = strtoul(buf, NULL, 10) << 16;
 1833                         pfsm = hammer_load_pseudofs(&trans, localization,
 1834                                                     &error);
 1835                         if (error == 0) {
 1836                                 if (pfsm->pfsd.mirror_flags &
 1837                                     HAMMER_PFSD_SLAVE) {
 1838                                         /* vap->va_size == 26 */
 1839                                         ksnprintf(buf, sizeof(buf),
 1840                                                   "@@0x%016llx:%05d",
 1841                                                   (long long)pfsm->pfsd.sync_end_tid,
 1842                                                   localization >> 16);
 1843                                 } else {
 1844                                         /* vap->va_size == 10 */
 1845                                         ksnprintf(buf, sizeof(buf),
 1846                                                   "@@-1:%05d",
 1847                                                   localization >> 16);
 1848 #if 0
 1849                                         ksnprintf(buf, sizeof(buf),
 1850                                                   "@@0x%016llx:%05d",
 1851                                                   (long long)HAMMER_MAX_TID,
 1852                                                   localization >> 16);
 1853 #endif
 1854                                 }
 1855                                 ptr = buf;
 1856                                 bytes = strlen(buf);
 1857                         }
 1858                         if (pfsm)
 1859                                 hammer_rel_pseudofs(hmp, pfsm);
 1860                         hammer_done_transaction(&trans);
 1861                 }
 1862                 error = uiomove(ptr, bytes, ap->a_uio);
 1863                 lwkt_reltoken(&hmp->fs_token);
 1864                 return(error);
 1865         }
 1866 
 1867         /*
 1868          * Long version
 1869          */
 1870         hammer_simple_transaction(&trans, hmp);
 1871         ++hammer_stats_file_iopsr;
 1872         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 1873 
 1874         /*
 1875          * Key range (begin and end inclusive) to scan.  Directory keys
 1876          * directly translate to a 64 bit 'seek' position.
 1877          */
 1878         cursor.key_beg.localization = ip->obj_localization +
 1879                                       HAMMER_LOCALIZE_MISC;
 1880         cursor.key_beg.obj_id = ip->obj_id;
 1881         cursor.key_beg.create_tid = 0;
 1882         cursor.key_beg.delete_tid = 0;
 1883         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
 1884         cursor.key_beg.obj_type = 0;
 1885         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
 1886         cursor.asof = ip->obj_asof;
 1887         cursor.flags |= HAMMER_CURSOR_ASOF;
 1888 
 1889         error = hammer_ip_lookup(&cursor);
 1890         if (error == 0) {
 1891                 error = hammer_ip_resolve_data(&cursor);
 1892                 if (error == 0) {
 1893                         KKASSERT(cursor.leaf->data_len >=
 1894                                  HAMMER_SYMLINK_NAME_OFF);
 1895                         error = uiomove(cursor.data->symlink.name,
 1896                                         cursor.leaf->data_len -
 1897                                                 HAMMER_SYMLINK_NAME_OFF,
 1898                                         ap->a_uio);
 1899                 }
 1900         }
 1901         hammer_done_cursor(&cursor);
 1902         hammer_done_transaction(&trans);
 1903         lwkt_reltoken(&hmp->fs_token);
 1904         return(error);
 1905 }
 1906 
 1907 /*
 1908  * hammer_vop_nremove { nch, dvp, cred }
 1909  */
 1910 static
 1911 int
 1912 hammer_vop_nremove(struct vop_nremove_args *ap)
 1913 {
 1914         struct hammer_transaction trans;
 1915         struct hammer_inode *dip;
 1916         hammer_mount_t hmp;
 1917         int error;
 1918 
 1919         dip = VTOI(ap->a_dvp);
 1920         hmp = dip->hmp;
 1921 
 1922         if (hammer_nohistory(dip) == 0 &&
 1923             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
 1924                 return (error);
 1925         }
 1926 
 1927         lwkt_gettoken(&hmp->fs_token);
 1928         hammer_start_transaction(&trans, hmp);
 1929         ++hammer_stats_file_iopsw;
 1930         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
 1931         hammer_done_transaction(&trans);
 1932         if (error == 0)
 1933                 hammer_knote(ap->a_dvp, NOTE_WRITE);
 1934         lwkt_reltoken(&hmp->fs_token);
 1935         return (error);
 1936 }
 1937 
 1938 /*
 1939  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 1940  */
 1941 static
 1942 int
 1943 hammer_vop_nrename(struct vop_nrename_args *ap)
 1944 {
 1945         struct hammer_transaction trans;
 1946         struct namecache *fncp;
 1947         struct namecache *tncp;
 1948         struct hammer_inode *fdip;
 1949         struct hammer_inode *tdip;
 1950         struct hammer_inode *ip;
 1951         hammer_mount_t hmp;
 1952         struct hammer_cursor cursor;
 1953         int64_t namekey;
 1954         u_int32_t max_iterations;
 1955         int nlen, error;
 1956 
 1957         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
 1958                 return(EXDEV);
 1959         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
 1960                 return(EXDEV);
 1961 
 1962         fdip = VTOI(ap->a_fdvp);
 1963         tdip = VTOI(ap->a_tdvp);
 1964         fncp = ap->a_fnch->ncp;
 1965         tncp = ap->a_tnch->ncp;
 1966         ip = VTOI(fncp->nc_vp);
 1967         KKASSERT(ip != NULL);
 1968 
 1969         hmp = ip->hmp;
 1970 
 1971         if (fdip->obj_localization != tdip->obj_localization)
 1972                 return(EXDEV);
 1973         if (fdip->obj_localization != ip->obj_localization)
 1974                 return(EXDEV);
 1975 
 1976         if (fdip->flags & HAMMER_INODE_RO)
 1977                 return (EROFS);
 1978         if (tdip->flags & HAMMER_INODE_RO)
 1979                 return (EROFS);
 1980         if (ip->flags & HAMMER_INODE_RO)
 1981                 return (EROFS);
 1982         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 1983                 return (error);
 1984 
 1985         lwkt_gettoken(&hmp->fs_token);
 1986         hammer_start_transaction(&trans, hmp);
 1987         ++hammer_stats_file_iopsw;
 1988 
 1989         /*
 1990          * Remove tncp from the target directory and then link ip as
 1991          * tncp. XXX pass trans to dounlink
 1992          *
 1993          * Force the inode sync-time to match the transaction so it is
 1994          * in-sync with the creation of the target directory entry.
 1995          */
 1996         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
 1997                                 ap->a_cred, 0, -1);
 1998         if (error == 0 || error == ENOENT) {
 1999                 error = hammer_ip_add_directory(&trans, tdip,
 2000                                                 tncp->nc_name, tncp->nc_nlen,
 2001                                                 ip);
 2002                 if (error == 0) {
 2003                         ip->ino_data.parent_obj_id = tdip->obj_id;
 2004                         ip->ino_data.ctime = trans.time;
 2005                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
 2006                 }
 2007         }
 2008         if (error)
 2009                 goto failed; /* XXX */
 2010 
 2011         /*
 2012          * Locate the record in the originating directory and remove it.
 2013          *
 2014          * Calculate the namekey and setup the key range for the scan.  This
 2015          * works kinda like a chained hash table where the lower 32 bits
 2016          * of the namekey synthesize the chain.
 2017          *
 2018          * The key range is inclusive of both key_beg and key_end.
 2019          */
 2020         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
 2021                                            &max_iterations);
 2022 retry:
 2023         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
 2024         cursor.key_beg.localization = fdip->obj_localization +
 2025                                       hammer_dir_localization(fdip);
 2026         cursor.key_beg.obj_id = fdip->obj_id;
 2027         cursor.key_beg.key = namekey;
 2028         cursor.key_beg.create_tid = 0;
 2029         cursor.key_beg.delete_tid = 0;
 2030         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
 2031         cursor.key_beg.obj_type = 0;
 2032 
 2033         cursor.key_end = cursor.key_beg;
 2034         cursor.key_end.key += max_iterations;
 2035         cursor.asof = fdip->obj_asof;
 2036         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
 2037 
 2038         /*
 2039          * Scan all matching records (the chain), locate the one matching
 2040          * the requested path component.
 2041          *
 2042          * The hammer_ip_*() functions merge in-memory records with on-disk
 2043          * records for the purposes of the search.
 2044          */
 2045         error = hammer_ip_first(&cursor);
 2046         while (error == 0) {
 2047                 if (hammer_ip_resolve_data(&cursor) != 0)
 2048                         break;
 2049                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
 2050                 KKASSERT(nlen > 0);
 2051                 if (fncp->nc_nlen == nlen &&
 2052                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
 2053                         break;
 2054                 }
 2055                 error = hammer_ip_next(&cursor);
 2056         }
 2057 
 2058         /*
 2059          * If all is ok we have to get the inode so we can adjust nlinks.
 2060          *
 2061          * WARNING: hammer_ip_del_directory() may have to terminate the
 2062          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
 2063          * twice.
 2064          */
 2065         if (error == 0)
 2066                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
 2067 
 2068         /*
 2069          * XXX A deadlock here will break rename's atomicy for the purposes
 2070          * of crash recovery.
 2071          */
 2072         if (error == EDEADLK) {
 2073                 hammer_done_cursor(&cursor);
 2074                 goto retry;
 2075         }
 2076 
 2077         /*
 2078          * Cleanup and tell the kernel that the rename succeeded.
 2079          *
 2080          * NOTE: ip->vp, if non-NULL, cannot be directly referenced
 2081          *       without formally acquiring the vp since the vp might
 2082          *       have zero refs on it, or in the middle of a reclaim,
 2083          *       etc.
 2084          */
 2085         hammer_done_cursor(&cursor);
 2086         if (error == 0) {
 2087                 cache_rename(ap->a_fnch, ap->a_tnch);
 2088                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
 2089                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
 2090                 while (ip->vp) {
 2091                         struct vnode *vp;
 2092 
 2093                         error = hammer_get_vnode(ip, &vp);
 2094                         if (error == 0 && vp) {
 2095                                 vn_unlock(vp);
 2096                                 hammer_knote(ip->vp, NOTE_RENAME);
 2097                                 vrele(vp);
 2098                                 break;
 2099                         }
 2100                         kprintf("Debug: HAMMER ip/vp race2 avoided\n");
 2101                 }
 2102         }
 2103 
 2104 failed:
 2105         hammer_done_transaction(&trans);
 2106         lwkt_reltoken(&hmp->fs_token);
 2107         return (error);
 2108 }
 2109 
 2110 /*
 2111  * hammer_vop_nrmdir { nch, dvp, cred }
 2112  */
 2113 static
 2114 int
 2115 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
 2116 {
 2117         struct hammer_transaction trans;
 2118         struct hammer_inode *dip;
 2119         hammer_mount_t hmp;
 2120         int error;
 2121 
 2122         dip = VTOI(ap->a_dvp);
 2123         hmp = dip->hmp;
 2124 
 2125         if (hammer_nohistory(dip) == 0 &&
 2126             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
 2127                 return (error);
 2128         }
 2129 
 2130         lwkt_gettoken(&hmp->fs_token);
 2131         hammer_start_transaction(&trans, hmp);
 2132         ++hammer_stats_file_iopsw;
 2133         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
 2134         hammer_done_transaction(&trans);
 2135         if (error == 0)
 2136                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
 2137         lwkt_reltoken(&hmp->fs_token);
 2138         return (error);
 2139 }
 2140 
 2141 /*
 2142  * hammer_vop_markatime { vp, cred }
 2143  */
 2144 static
 2145 int
 2146 hammer_vop_markatime(struct vop_markatime_args *ap)
 2147 {
 2148         struct hammer_transaction trans;
 2149         struct hammer_inode *ip;
 2150         hammer_mount_t hmp;
 2151 
 2152         ip = VTOI(ap->a_vp);
 2153         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 2154                 return (EROFS);
 2155         if (ip->flags & HAMMER_INODE_RO)
 2156                 return (EROFS);
 2157         hmp = ip->hmp;
 2158         if (hmp->mp->mnt_flag & MNT_NOATIME)
 2159                 return (0);
 2160         lwkt_gettoken(&hmp->fs_token);
 2161         hammer_start_transaction(&trans, hmp);
 2162         ++hammer_stats_file_iopsw;
 2163 
 2164         ip->ino_data.atime = trans.time;
 2165         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
 2166         hammer_done_transaction(&trans);
 2167         hammer_knote(ap->a_vp, NOTE_ATTRIB);
 2168         lwkt_reltoken(&hmp->fs_token);
 2169         return (0);
 2170 }
 2171 
 2172 /*
 2173  * hammer_vop_setattr { vp, vap, cred }
 2174  */
 2175 static
 2176 int
 2177 hammer_vop_setattr(struct vop_setattr_args *ap)
 2178 {
 2179         struct hammer_transaction trans;
 2180         struct hammer_inode *ip;
 2181         struct vattr *vap;
 2182         hammer_mount_t hmp;
 2183         int modflags;
 2184         int error;
 2185         int truncating;
 2186         int blksize;
 2187         int kflags;
 2188 #if 0
 2189         int64_t aligned_size;
 2190 #endif
 2191         u_int32_t flags;
 2192 
 2193         vap = ap->a_vap;
 2194         ip = ap->a_vp->v_data;
 2195         modflags = 0;
 2196         kflags = 0;
 2197         hmp = ip->hmp;
 2198 
 2199         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
 2200                 return(EROFS);
 2201         if (ip->flags & HAMMER_INODE_RO)
 2202                 return (EROFS);
 2203         if (hammer_nohistory(ip) == 0 &&
 2204             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
 2205                 return (error);
 2206         }
 2207 
 2208         lwkt_gettoken(&hmp->fs_token);
 2209         hammer_start_transaction(&trans, hmp);
 2210         ++hammer_stats_file_iopsw;
 2211         error = 0;
 2212 
 2213         if (vap->va_flags != VNOVAL) {
 2214                 flags = ip->ino_data.uflags;
 2215                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
 2216                                          hammer_to_unix_xid(&ip->ino_data.uid),
 2217                                          ap->a_cred);
 2218                 if (error == 0) {
 2219                         if (ip->ino_data.uflags != flags) {
 2220                                 ip->ino_data.uflags = flags;
 2221                                 ip->ino_data.ctime = trans.time;
 2222                                 modflags |= HAMMER_INODE_DDIRTY;
 2223                                 kflags |= NOTE_ATTRIB;
 2224                         }
 2225                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
 2226                                 error = 0;
 2227                                 goto done;
 2228                         }
 2229                 }
 2230                 goto done;
 2231         }
 2232         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
 2233                 error = EPERM;
 2234                 goto done;
 2235         }
 2236         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 2237                 mode_t cur_mode = ip->ino_data.mode;
 2238                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
 2239                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
 2240                 uuid_t uuid_uid;
 2241                 uuid_t uuid_gid;
 2242 
 2243                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
 2244                                          ap->a_cred,
 2245                                          &cur_uid, &cur_gid, &cur_mode);
 2246                 if (error == 0) {
 2247                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
 2248                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
 2249                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
 2250                                  sizeof(uuid_uid)) ||
 2251                             bcmp(&uuid_gid, &ip->ino_data.gid,
 2252                                  sizeof(uuid_gid)) ||
 2253                             ip->ino_data.mode != cur_mode
 2254                         ) {
 2255                                 ip->ino_data.uid = uuid_uid;
 2256                                 ip->ino_data.gid = uuid_gid;
 2257                                 ip->ino_data.mode = cur_mode;
 2258                                 ip->ino_data.ctime = trans.time;
 2259                                 modflags |= HAMMER_INODE_DDIRTY;
 2260                         }
 2261                         kflags |= NOTE_ATTRIB;
 2262                 }
 2263         }
 2264         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
 2265                 switch(ap->a_vp->v_type) {
 2266                 case VREG:
 2267                         if (vap->va_size == ip->ino_data.size)
 2268                                 break;
 2269 
 2270                         /*
 2271                          * Log the operation if in fast-fsync mode or if
 2272                          * there are unterminated redo write records present.
 2273                          *
 2274                          * The second check is needed so the recovery code
 2275                          * properly truncates write redos even if nominal
 2276                          * REDO operations is turned off due to excessive
 2277                          * writes, because the related records might be
 2278                          * destroyed and never lay down a TERM_WRITE.
 2279                          */
 2280                         if ((ip->flags & HAMMER_INODE_REDO) ||
 2281                             (ip->flags & HAMMER_INODE_RDIRTY)) {
 2282                                 error = hammer_generate_redo(&trans, ip,
 2283                                                              vap->va_size,
 2284                                                              HAMMER_REDO_TRUNC,
 2285                                                              NULL, 0);
 2286                         }
 2287                         blksize = hammer_blocksize(vap->va_size);
 2288 
 2289                         /*
 2290                          * XXX break atomicity, we can deadlock the backend
 2291                          * if we do not release the lock.  Probably not a
 2292                          * big deal here.
 2293                          */
 2294                         if (vap->va_size < ip->ino_data.size) {
 2295                                 nvtruncbuf(ap->a_vp, vap->va_size,
 2296                                            blksize,
 2297                                            hammer_blockoff(vap->va_size),
 2298                                            0);
 2299                                 truncating = 1;
 2300                                 kflags |= NOTE_WRITE;
 2301                         } else {
 2302                                 nvextendbuf(ap->a_vp,
 2303                                             ip->ino_data.size,
 2304                                             vap->va_size,
 2305                                             hammer_blocksize(ip->ino_data.size),
 2306                                             hammer_blocksize(vap->va_size),
 2307                                             hammer_blockoff(ip->ino_data.size),
 2308                                             hammer_blockoff(vap->va_size),
 2309                                             0);
 2310                                 truncating = 0;
 2311                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
 2312                         }
 2313                         ip->ino_data.size = vap->va_size;
 2314                         ip->ino_data.mtime = trans.time;
 2315                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
 2316                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
 2317 
 2318                         /*
 2319                          * On-media truncation is cached in the inode until
 2320                          * the inode is synchronized.  We must immediately
 2321                          * handle any frontend records.
 2322                          */
 2323                         if (truncating) {
 2324                                 hammer_ip_frontend_trunc(ip, vap->va_size);
 2325 #ifdef DEBUG_TRUNCATE
 2326                                 if (HammerTruncIp == NULL)
 2327                                         HammerTruncIp = ip;
 2328 #endif
 2329                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
 2330                                         ip->flags |= HAMMER_INODE_TRUNCATED;
 2331                                         ip->trunc_off = vap->va_size;
 2332                                         hammer_inode_dirty(ip);
 2333 #ifdef DEBUG_TRUNCATE
 2334                                         if (ip == HammerTruncIp)
 2335                                         kprintf("truncate1 %016llx\n",
 2336                                                 (long long)ip->trunc_off);
 2337 #endif
 2338                                 } else if (ip->trunc_off > vap->va_size) {
 2339                                         ip->trunc_off = vap->va_size;
 2340 #ifdef DEBUG_TRUNCATE
 2341                                         if (ip == HammerTruncIp)
 2342                                         kprintf("truncate2 %016llx\n",
 2343                                                 (long long)ip->trunc_off);
 2344 #endif
 2345                                 } else {
 2346 #ifdef DEBUG_TRUNCATE
 2347                                         if (ip == HammerTruncIp)
 2348                                         kprintf("truncate3 %016llx (ignored)\n",
 2349                                                 (long long)vap->va_size);
 2350 #endif
 2351                                 }
 2352                         }
 2353 
 2354 #if 0
 2355                         /*
 2356                          * When truncating, nvtruncbuf() may have cleaned out
 2357                          * a portion of the last block on-disk in the buffer
 2358                          * cache.  We must clean out any frontend records
 2359                          * for blocks beyond the new last block.
 2360                          */
 2361                         aligned_size = (vap->va_size + (blksize - 1)) &
 2362                                        ~(int64_t)(blksize - 1);
 2363                         if (truncating && vap->va_size < aligned_size) {
 2364                                 aligned_size -= blksize;
 2365                                 hammer_ip_frontend_trunc(ip, aligned_size);
 2366                         }
 2367 #endif
 2368                         break;
 2369                 case VDATABASE:
 2370                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
 2371                                 ip->flags |= HAMMER_INODE_TRUNCATED;
 2372                                 ip->trunc_off = vap->va_size;
 2373                                 hammer_inode_dirty(ip);
 2374                         } else if (ip->trunc_off > vap->va_size) {
 2375                                 ip->trunc_off = vap->va_size;
 2376                         }
 2377                         hammer_ip_frontend_trunc(ip, vap->va_size);
 2378                         ip->ino_data.size = vap->va_size;
 2379                         ip->ino_data.mtime = trans.time;
 2380                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
 2381                         kflags |= NOTE_ATTRIB;
 2382                         break;
 2383                 default:
 2384                         error = EINVAL;
 2385                         goto done;
 2386                 }
 2387                 break;
 2388         }
 2389         if (vap->va_atime.tv_sec != VNOVAL) {
 2390                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
 2391                 modflags |= HAMMER_INODE_ATIME;
 2392                 kflags |= NOTE_ATTRIB;
 2393         }
 2394         if (vap->va_mtime.tv_sec != VNOVAL) {
 2395                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
 2396                 modflags |= HAMMER_INODE_MTIME;
 2397                 kflags |= NOTE_ATTRIB;
 2398         }
 2399         if (vap->va_mode != (mode_t)VNOVAL) {
 2400                 mode_t   cur_mode = ip->ino_data.mode;
 2401                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
 2402                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
 2403 
 2404                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
 2405                                          cur_uid, cur_gid, &cur_mode);
 2406                 if (error == 0 && ip->ino_data.mode != cur_mode) {
 2407                         ip->ino_data.mode = cur_mode;
 2408                         ip->ino_data.ctime = trans.time;
 2409                         modflags |= HAMMER_INODE_DDIRTY;
 2410                         kflags |= NOTE_ATTRIB;
 2411                 }
 2412         }
 2413 done:
 2414         if (error == 0)
 2415                 hammer_modify_inode(&trans, ip, modflags);
 2416         hammer_done_transaction(&trans);
 2417         hammer_knote(ap->a_vp, kflags);
 2418         lwkt_reltoken(&hmp->fs_token);
 2419         return (error);
 2420 }
 2421 
 2422 /*
 2423  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 2424  */
 2425 static
 2426 int
 2427 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
 2428 {
 2429         struct hammer_transaction trans;
 2430         struct hammer_inode *dip;
 2431         struct hammer_inode *nip;
 2432         hammer_record_t record;
 2433         struct nchandle *nch;
 2434         hammer_mount_t hmp;
 2435         int error;
 2436         int bytes;
 2437 
 2438         ap->a_vap->va_type = VLNK;
 2439 
 2440         nch = ap->a_nch;
 2441         dip = VTOI(ap->a_dvp);
 2442         hmp = dip->hmp;
 2443 
 2444         if (dip->flags & HAMMER_INODE_RO)
 2445                 return (EROFS);
 2446         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
 2447                 return (error);
 2448 
 2449         /*
 2450          * Create a transaction to cover the operations we perform.
 2451          */
 2452         lwkt_gettoken(&hmp->fs_token);
 2453         hammer_start_transaction(&trans, hmp);
 2454         ++hammer_stats_file_iopsw;
 2455 
 2456         /*
 2457          * Create a new filesystem object of the requested type.  The
 2458          * returned inode will be referenced but not locked.
 2459          */
 2460 
 2461         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
 2462                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
 2463                                     NULL, &nip);
 2464         if (error) {
 2465                 hammer_done_transaction(&trans);
 2466                 *ap->a_vpp = NULL;
 2467                 lwkt_reltoken(&hmp->fs_token);
 2468                 return (error);
 2469         }
 2470 
 2471         /*
 2472          * Add a record representing the symlink.  symlink stores the link
 2473          * as pure data, not a string, and is no \0 terminated.
 2474          */
 2475         if (error == 0) {
 2476                 bytes = strlen(ap->a_target);
 2477 
 2478                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
 2479                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
 2480                 } else {
 2481                         record = hammer_alloc_mem_record(nip, bytes);
 2482                         record->type = HAMMER_MEM_RECORD_GENERAL;
 2483 
 2484                         record->leaf.base.localization = nip->obj_localization +
 2485                                                          HAMMER_LOCALIZE_MISC;
 2486                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
 2487                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
 2488                         record->leaf.data_len = bytes;
 2489                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
 2490                         bcopy(ap->a_target, record->data->symlink.name, bytes);
 2491                         error = hammer_ip_add_record(&trans, record);
 2492                 }
 2493 
 2494                 /*
 2495                  * Set the file size to the length of the link.
 2496                  */
 2497                 if (error == 0) {
 2498                         nip->ino_data.size = bytes;
 2499                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
 2500                 }
 2501         }
 2502         if (error == 0)
 2503                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
 2504                                                 nch->ncp->nc_nlen, nip);
 2505 
 2506         /*
 2507          * Finish up.
 2508          */
 2509         if (error) {
 2510                 hammer_rel_inode(nip, 0);
 2511                 *ap->a_vpp = NULL;
 2512         } else {
 2513                 error = hammer_get_vnode(nip, ap->a_vpp);
 2514                 hammer_rel_inode(nip, 0);
 2515                 if (error == 0) {
 2516                         cache_setunresolved(ap->a_nch);
 2517                         cache_setvp(ap->a_nch, *ap->a_vpp);
 2518                         hammer_knote(ap->a_dvp, NOTE_WRITE);
 2519                 }
 2520         }
 2521         hammer_done_transaction(&trans);
 2522         lwkt_reltoken(&hmp->fs_token);
 2523         return (error);
 2524 }
 2525 
 2526 /*
 2527  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
 2528  */
 2529 static
 2530 int
 2531 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
 2532 {
 2533         struct hammer_transaction trans;
 2534         struct hammer_inode *dip;
 2535         hammer_mount_t hmp;
 2536         int error;
 2537 
 2538         dip = VTOI(ap->a_dvp);
 2539         hmp = dip->hmp;
 2540 
 2541         if (hammer_nohistory(dip) == 0 &&
 2542             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
 2543                 return (error);
 2544         }
 2545 
 2546         lwkt_gettoken(&hmp->fs_token);
 2547         hammer_start_transaction(&trans, hmp);
 2548         ++hammer_stats_file_iopsw;
 2549         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
 2550                                 ap->a_cred, ap->a_flags, -1);
 2551         hammer_done_transaction(&trans);
 2552         lwkt_reltoken(&hmp->fs_token);
 2553 
 2554         return (error);
 2555 }
 2556 
 2557 /*
 2558  * hammer_vop_ioctl { vp, command, data, fflag, cred }
 2559  */
 2560 static
 2561 int
 2562 hammer_vop_ioctl(struct vop_ioctl_args *ap)
 2563 {
 2564         struct hammer_inode *ip = ap->a_vp->v_data;
 2565         hammer_mount_t hmp = ip->hmp;
 2566         int error;
 2567 
 2568         ++hammer_stats_file_iopsr;
 2569         lwkt_gettoken(&hmp->fs_token);
 2570         error = hammer_ioctl(ip, ap->a_command, ap->a_data,
 2571                              ap->a_fflag, ap->a_cred);
 2572         lwkt_reltoken(&hmp->fs_token);
 2573         return (error);
 2574 }
 2575 
 2576 static
 2577 int
 2578 hammer_vop_mountctl(struct vop_mountctl_args *ap)
 2579 {
 2580         static const struct mountctl_opt extraopt[] = {
 2581                 { HMNT_NOHISTORY,       "nohistory" },
 2582                 { HMNT_MASTERID,        "master" },
 2583                 { 0, NULL}
 2584 
 2585         };
 2586         struct hammer_mount *hmp;
 2587         struct mount *mp;
 2588         int usedbytes;
 2589         int error;
 2590 
 2591         error = 0;
 2592         usedbytes = 0;
 2593         mp = ap->a_head.a_ops->head.vv_mount;
 2594         KKASSERT(mp->mnt_data != NULL);
 2595         hmp = (struct hammer_mount *)mp->mnt_data;
 2596 
 2597         lwkt_gettoken(&hmp->fs_token);
 2598 
 2599         switch(ap->a_op) {
 2600         case MOUNTCTL_SET_EXPORT:
 2601                 if (ap->a_ctllen != sizeof(struct export_args))
 2602                         error = EINVAL;
 2603                 else
 2604                         error = hammer_vfs_export(mp, ap->a_op,
 2605                                       (const struct export_args *)ap->a_ctl);
 2606                 break;
 2607         case MOUNTCTL_MOUNTFLAGS:
 2608         {
 2609                 /*
 2610                  * Call standard mountctl VOP function
 2611                  * so we get user mount flags.
 2612                  */
 2613                 error = vop_stdmountctl(ap);
 2614                 if (error)
 2615                         break;
 2616 
 2617                 usedbytes = *ap->a_res;
 2618 
 2619                 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
 2620                         usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
 2621                                                     ap->a_buf,
 2622                                                     ap->a_buflen - usedbytes,
 2623                                                     &error);
 2624                 }
 2625 
 2626                 *ap->a_res += usedbytes;
 2627                 break;
 2628         }
 2629         default:
 2630                 error = vop_stdmountctl(ap);
 2631                 break;
 2632         }
 2633         lwkt_reltoken(&hmp->fs_token);
 2634         return(error);
 2635 }
 2636 
 2637 /*
 2638  * hammer_vop_strategy { vp, bio }
 2639  *
 2640  * Strategy call, used for regular file read & write only.  Note that the
 2641  * bp may represent a cluster.
 2642  *
 2643  * To simplify operation and allow better optimizations in the future,
 2644  * this code does not make any assumptions with regards to buffer alignment
 2645  * or size.
 2646  */
 2647 static
 2648 int
 2649 hammer_vop_strategy(struct vop_strategy_args *ap)
 2650 {
 2651         struct buf *bp;
 2652         int error;
 2653 
 2654         bp = ap->a_bio->bio_buf;
 2655 
 2656         switch(bp->b_cmd) {
 2657         case BUF_CMD_READ:
 2658                 error = hammer_vop_strategy_read(ap);
 2659                 break;
 2660         case BUF_CMD_WRITE:
 2661                 error = hammer_vop_strategy_write(ap);
 2662                 break;
 2663         default:
 2664                 bp->b_error = error = EINVAL;
 2665                 bp->b_flags |= B_ERROR;
 2666                 biodone(ap->a_bio);
 2667                 break;
 2668         }
 2669 
 2670         /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
 2671 
 2672         return (error);
 2673 }
 2674 
 2675 /*
 2676  * Read from a regular file.  Iterate the related records and fill in the
 2677  * BIO/BUF.  Gaps are zero-filled.
 2678  *
 2679  * The support code in hammer_object.c should be used to deal with mixed
 2680  * in-memory and on-disk records.
 2681  *
 2682  * NOTE: Can be called from the cluster code with an oversized buf.
 2683  *
 2684  * XXX atime update
 2685  */
 2686 static
 2687 int
 2688 hammer_vop_strategy_read(struct vop_strategy_args *ap)
 2689 {
 2690         struct hammer_transaction trans;
 2691         struct hammer_inode *ip;
 2692         struct hammer_inode *dip;
 2693         hammer_mount_t hmp;
 2694         struct hammer_cursor cursor;
 2695         hammer_base_elm_t base;
 2696         hammer_off_t disk_offset;
 2697         struct bio *bio;
 2698         struct bio *nbio;
 2699         struct buf *bp;
 2700         int64_t rec_offset;
 2701         int64_t ran_end;
 2702         int64_t tmp64;
 2703         int error;
 2704         int boff;
 2705         int roff;
 2706         int n;
 2707         int isdedupable;
 2708 
 2709         bio = ap->a_bio;
 2710         bp = bio->bio_buf;
 2711         ip = ap->a_vp->v_data;
 2712         hmp = ip->hmp;
 2713 
 2714         /*
 2715          * The zone-2 disk offset may have been set by the cluster code via
 2716          * a BMAP operation, or else should be NOOFFSET.
 2717          *
 2718          * Checking the high bits for a match against zone-2 should suffice.
 2719          *
 2720          * In cases where a lot of data duplication is present it may be
 2721          * more beneficial to drop through and double-buffer through the
 2722          * device.
 2723          */
 2724         nbio = push_bio(bio);
 2725         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
 2726             HAMMER_ZONE_LARGE_DATA) {
 2727                 if (hammer_double_buffer == 0) {
 2728                         lwkt_gettoken(&hmp->fs_token);
 2729                         error = hammer_io_direct_read(hmp, nbio, NULL);
 2730                         lwkt_reltoken(&hmp->fs_token);
 2731                         return (error);
 2732                 }
 2733 
 2734                 /*
 2735                  * Try to shortcut requests for double_buffer mode too.
 2736                  * Since this mode runs through the device buffer cache
 2737                  * only compatible buffer sizes (meaning those generated
 2738                  * by normal filesystem buffers) are legal.
 2739                  */
 2740                 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
 2741                         lwkt_gettoken(&hmp->fs_token);
 2742                         error = hammer_io_indirect_read(hmp, nbio, NULL);
 2743                         lwkt_reltoken(&hmp->fs_token);
 2744                         return (error);
 2745                 }
 2746         }
 2747 
 2748         /*
 2749          * Well, that sucked.  Do it the hard way.  If all the stars are
 2750          * aligned we may still be able to issue a direct-read.
 2751          */
 2752         lwkt_gettoken(&hmp->fs_token);
 2753         hammer_simple_transaction(&trans, hmp);
 2754         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 2755 
 2756         /*
 2757          * Key range (begin and end inclusive) to scan.  Note that the key's
 2758          * stored in the actual records represent BASE+LEN, not BASE.  The
 2759          * first record containing bio_offset will have a key > bio_offset.
 2760          */
 2761         cursor.key_beg.localization = ip->obj_localization +
 2762                                       HAMMER_LOCALIZE_MISC;
 2763         cursor.key_beg.obj_id = ip->obj_id;
 2764         cursor.key_beg.create_tid = 0;
 2765         cursor.key_beg.delete_tid = 0;
 2766         cursor.key_beg.obj_type = 0;
 2767         cursor.key_beg.key = bio->bio_offset + 1;
 2768         cursor.asof = ip->obj_asof;
 2769         cursor.flags |= HAMMER_CURSOR_ASOF;
 2770 
 2771         cursor.key_end = cursor.key_beg;
 2772         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
 2773 #if 0
 2774         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
 2775                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
 2776                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
 2777                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
 2778         } else
 2779 #endif
 2780         {
 2781                 ran_end = bio->bio_offset + bp->b_bufsize;
 2782                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
 2783                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
 2784                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
 2785                 if (tmp64 < ran_end)
 2786                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
 2787                 else
 2788                         cursor.key_end.key = ran_end + MAXPHYS + 1;
 2789         }
 2790         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 2791 
 2792         /*
 2793          * Set NOSWAPCACHE for cursor data extraction if double buffering
 2794          * is disabled or (if the file is not marked cacheable via chflags
 2795          * and vm.swapcache_use_chflags is enabled).
 2796          */
 2797         if (hammer_double_buffer == 0 ||
 2798             ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
 2799              vm_swapcache_use_chflags)) {
 2800                 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
 2801         }
 2802 
 2803         error = hammer_ip_first(&cursor);
 2804         boff = 0;
 2805 
 2806         while (error == 0) {
 2807                 /*
 2808                  * Get the base file offset of the record.  The key for
 2809                  * data records is (base + bytes) rather than (base).
 2810                  */
 2811                 base = &cursor.leaf->base;
 2812                 rec_offset = base->key - cursor.leaf->data_len;
 2813 
 2814                 /*
 2815                  * Calculate the gap, if any, and zero-fill it.
 2816                  *
 2817                  * n is the offset of the start of the record versus our
 2818                  * current seek offset in the bio.
 2819                  */
 2820                 n = (int)(rec_offset - (bio->bio_offset + boff));
 2821                 if (n > 0) {
 2822                         if (n > bp->b_bufsize - boff)
 2823                                 n = bp->b_bufsize - boff;
 2824                         bzero((char *)bp->b_data + boff, n);
 2825                         boff += n;
 2826                         n = 0;
 2827                 }
 2828 
 2829                 /*
 2830                  * Calculate the data offset in the record and the number
 2831                  * of bytes we can copy.
 2832                  *
 2833                  * There are two degenerate cases.  First, boff may already
 2834                  * be at bp->b_bufsize.  Secondly, the data offset within
 2835                  * the record may exceed the record's size.
 2836                  */
 2837                 roff = -n;
 2838                 rec_offset += roff;
 2839                 n = cursor.leaf->data_len - roff;
 2840                 if (n <= 0) {
 2841                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
 2842                         n = 0;
 2843                 } else if (n > bp->b_bufsize - boff) {
 2844                         n = bp->b_bufsize - boff;
 2845                 }
 2846 
 2847                 /*
 2848                  * Deal with cached truncations.  This cool bit of code
 2849                  * allows truncate()/ftruncate() to avoid having to sync
 2850                  * the file.
 2851                  *
 2852                  * If the frontend is truncated then all backend records are
 2853                  * subject to the frontend's truncation.
 2854                  *
 2855                  * If the backend is truncated then backend records on-disk
 2856                  * (but not in-memory) are subject to the backend's
 2857                  * truncation.  In-memory records owned by the backend
 2858                  * represent data written after the truncation point on the
 2859                  * backend and must not be truncated.
 2860                  *
 2861                  * Truncate operations deal with frontend buffer cache
 2862                  * buffers and frontend-owned in-memory records synchronously.
 2863                  */
 2864                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
 2865                         if (hammer_cursor_ondisk(&cursor)/* ||
 2866                             cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
 2867                                 if (ip->trunc_off <= rec_offset)
 2868                                         n = 0;
 2869                                 else if (ip->trunc_off < rec_offset + n)
 2870                                         n = (int)(ip->trunc_off - rec_offset);
 2871                         }
 2872                 }
 2873                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
 2874                         if (hammer_cursor_ondisk(&cursor)) {
 2875                                 if (ip->sync_trunc_off <= rec_offset)
 2876                                         n = 0;
 2877                                 else if (ip->sync_trunc_off < rec_offset + n)
 2878                                         n = (int)(ip->sync_trunc_off - rec_offset);
 2879                         }
 2880                 }
 2881 
 2882                 /*
 2883                  * Try to issue a direct read into our bio if possible,
 2884                  * otherwise resolve the element data into a hammer_buffer
 2885                  * and copy.
 2886                  *
 2887                  * The buffer on-disk should be zeroed past any real
 2888                  * truncation point, but may not be for any synthesized
 2889                  * truncation point from above.
 2890                  *
 2891                  * NOTE: disk_offset is only valid if the cursor data is
 2892                  *       on-disk.
 2893                  */
 2894                 disk_offset = cursor.leaf->data_offset + roff;
 2895                 isdedupable = (boff == 0 && n == bp->b_bufsize &&
 2896                                hammer_cursor_ondisk(&cursor) &&
 2897                                ((int)disk_offset & HAMMER_BUFMASK) == 0);
 2898 
 2899                 if (isdedupable && hammer_double_buffer == 0) {
 2900                         /*
 2901                          * Direct read case
 2902                          */
 2903                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
 2904                                  HAMMER_ZONE_LARGE_DATA);
 2905                         nbio->bio_offset = disk_offset;
 2906                         error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
 2907                         if (hammer_live_dedup && error == 0)
 2908                                 hammer_dedup_cache_add(ip, cursor.leaf);
 2909                         goto done;
 2910                 } else if (isdedupable) {
 2911                         /*
 2912                          * Async I/O case for reading from backing store
 2913                          * and copying the data to the filesystem buffer.
 2914                          * live-dedup has to verify the data anyway if it
 2915                          * gets a hit later so we can just add the entry
 2916                          * now.
 2917                          */
 2918                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
 2919                                  HAMMER_ZONE_LARGE_DATA);
 2920                         nbio->bio_offset = disk_offset;
 2921                         if (hammer_live_dedup)
 2922                                 hammer_dedup_cache_add(ip, cursor.leaf);
 2923                         error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
 2924                         goto done;
 2925                 } else if (n) {
 2926                         error = hammer_ip_resolve_data(&cursor);
 2927                         if (error == 0) {
 2928                                 if (hammer_live_dedup && isdedupable)
 2929                                         hammer_dedup_cache_add(ip, cursor.leaf);
 2930                                 bcopy((char *)cursor.data + roff,
 2931                                       (char *)bp->b_data + boff, n);
 2932                         }
 2933                 }
 2934                 if (error)
 2935                         break;
 2936 
 2937                 /*
 2938                  * We have to be sure that the only elements added to the
 2939                  * dedup cache are those which are already on-media.
 2940                  */
 2941                 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
 2942                         hammer_dedup_cache_add(ip, cursor.leaf);
 2943 
 2944                 /*
 2945                  * Iterate until we have filled the request.
 2946                  */
 2947                 boff += n;
 2948                 if (boff == bp->b_bufsize)
 2949                         break;
 2950                 error = hammer_ip_next(&cursor);
 2951         }
 2952 
 2953         /*
 2954          * There may have been a gap after the last record
 2955          */
 2956         if (error == ENOENT)
 2957                 error = 0;
 2958         if (error == 0 && boff != bp->b_bufsize) {
 2959                 KKASSERT(boff < bp->b_bufsize);
 2960                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
 2961                 /* boff = bp->b_bufsize; */
 2962         }
 2963 
 2964         /*
 2965          * Disallow swapcache operation on the vnode buffer if double
 2966          * buffering is enabled, the swapcache will get the data via
 2967          * the block device buffer.
 2968          */
 2969         if (hammer_double_buffer)
 2970                 bp->b_flags |= B_NOTMETA;
 2971 
 2972         /*
 2973          * Cleanup
 2974          */
 2975         bp->b_resid = 0;
 2976         bp->b_error = error;
 2977         if (error)
 2978                 bp->b_flags |= B_ERROR;
 2979         biodone(ap->a_bio);
 2980 
 2981 done:
 2982         /*
 2983          * Cache the b-tree node for the last data read in cache[1].
 2984          *
 2985          * If we hit the file EOF then also cache the node in the
 2986          * governing director's cache[3], it will be used to initialize
 2987          * the inode's cache[1] for any inodes looked up via the directory.
 2988          *
 2989          * This doesn't reduce disk accesses since the B-Tree chain is
 2990          * likely cached, but it does reduce cpu overhead when looking
 2991          * up file offsets for cpdup/tar/cpio style iterations.
 2992          */
 2993         if (cursor.node)
 2994                 hammer_cache_node(&ip->cache[1], cursor.node);
 2995         if (ran_end >= ip->ino_data.size) {
 2996                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
 2997                                         ip->obj_asof, ip->obj_localization);
 2998                 if (dip) {
 2999                         hammer_cache_node(&dip->cache[3], cursor.node);
 3000                         hammer_rel_inode(dip, 0);
 3001                 }
 3002         }
 3003         hammer_done_cursor(&cursor);
 3004         hammer_done_transaction(&trans);
 3005         lwkt_reltoken(&hmp->fs_token);
 3006         return(error);
 3007 }
 3008 
 3009 /*
 3010  * BMAP operation - used to support cluster_read() only.
 3011  *
 3012  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 3013  *
 3014  * This routine may return EOPNOTSUPP if the opration is not supported for
 3015  * the specified offset.  The contents of the pointer arguments do not
 3016  * need to be initialized in that case. 
 3017  *
 3018  * If a disk address is available and properly aligned return 0 with 
 3019  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 3020  * to the run-length relative to that offset.  Callers may assume that
 3021  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
 3022  * large, so return EOPNOTSUPP if it is not sufficiently large.
 3023  */
 3024 static
 3025 int
 3026 hammer_vop_bmap(struct vop_bmap_args *ap)
 3027 {
 3028         struct hammer_transaction trans;
 3029         struct hammer_inode *ip;
 3030         hammer_mount_t hmp;
 3031         struct hammer_cursor cursor;
 3032         hammer_base_elm_t base;
 3033         int64_t rec_offset;
 3034         int64_t ran_end;
 3035         int64_t tmp64;
 3036         int64_t base_offset;
 3037         int64_t base_disk_offset;
 3038         int64_t last_offset;
 3039         hammer_off_t last_disk_offset;
 3040         hammer_off_t disk_offset;
 3041         int     rec_len;
 3042         int     error;
 3043         int     blksize;
 3044 
 3045         ++hammer_stats_file_iopsr;
 3046         ip = ap->a_vp->v_data;
 3047         hmp = ip->hmp;
 3048 
 3049         /*
 3050          * We can only BMAP regular files.  We can't BMAP database files,
 3051          * directories, etc.
 3052          */
 3053         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
 3054                 return(EOPNOTSUPP);
 3055 
 3056         /*
 3057          * bmap is typically called with runp/runb both NULL when used
 3058          * for writing.  We do not support BMAP for writing atm.
 3059          */
 3060         if (ap->a_cmd != BUF_CMD_READ)
 3061                 return(EOPNOTSUPP);
 3062 
 3063         /*
 3064          * Scan the B-Tree to acquire blockmap addresses, then translate
 3065          * to raw addresses.
 3066          */
 3067         lwkt_gettoken(&hmp->fs_token);
 3068         hammer_simple_transaction(&trans, hmp);
 3069 #if 0
 3070         kprintf("bmap_beg %016llx ip->cache %p\n",
 3071                 (long long)ap->a_loffset, ip->cache[1]);
 3072 #endif
 3073         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
 3074 
 3075         /*
 3076          * Key range (begin and end inclusive) to scan.  Note that the key's
 3077          * stored in the actual records represent BASE+LEN, not BASE.  The
 3078          * first record containing bio_offset will have a key > bio_offset.
 3079          */
 3080         cursor.key_beg.localization = ip->obj_localization +
 3081                                       HAMMER_LOCALIZE_MISC;
 3082         cursor.key_beg.obj_id = ip->obj_id;
 3083         cursor.key_beg.create_tid = 0;
 3084         cursor.key_beg.delete_tid = 0;
 3085         cursor.key_beg.obj_type = 0;
 3086         if (ap->a_runb)
 3087                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
 3088         else
 3089                 cursor.key_beg.key = ap->a_loffset + 1;
 3090         if (cursor.key_beg.key < 0)
 3091                 cursor.key_beg.key = 0;
 3092         cursor.asof = ip->obj_asof;
 3093         cursor.flags |= HAMMER_CURSOR_ASOF;
 3094 
 3095         cursor.key_end = cursor.key_beg;
 3096         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
 3097 
 3098         ran_end = ap->a_loffset + MAXPHYS;
 3099         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
 3100         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
 3101         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
 3102         if (tmp64 < ran_end)
 3103                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
 3104         else
 3105                 cursor.key_end.key = ran_end + MAXPHYS + 1;
 3106 
 3107         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
 3108 
 3109         error = hammer_ip_first(&cursor);
 3110         base_offset = last_offset = 0;
 3111         base_disk_offset = last_disk_offset = 0;
 3112 
 3113         while (error == 0) {
 3114                 /*
 3115                  * Get the base file offset of the record.  The key for
 3116                  * data records is (base + bytes) rather then (base).
 3117                  *
 3118                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
 3119                  * The extra bytes should be zero on-disk and the BMAP op
 3120                  * should still be ok.
 3121                  */
 3122                 base = &cursor.leaf->base;
 3123                 rec_offset = base->key - cursor.leaf->data_len;
 3124                 rec_len    = cursor.leaf->data_len;
 3125 
 3126                 /*
 3127                  * Incorporate any cached truncation.
 3128                  *
 3129                  * NOTE: Modifications to rec_len based on synthesized
 3130                  * truncation points remove the guarantee that any extended
 3131                  * data on disk is zero (since the truncations may not have
 3132                  * taken place on-media yet).
 3133                  */
 3134                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
 3135                         if (hammer_cursor_ondisk(&cursor) ||
 3136                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
 3137                                 if (ip->trunc_off <= rec_offset)
 3138                                         rec_len = 0;
 3139                                 else if (ip->trunc_off < rec_offset + rec_len)
 3140                                         rec_len = (int)(ip->trunc_off - rec_offset);
 3141                         }
 3142                 }
 3143                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
 3144                         if (hammer_cursor_ondisk(&cursor)) {
 3145                                 if (ip->sync_trunc_off <= rec_offset)
 3146                                         rec_len = 0;
 3147                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
 3148                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
 3149                         }
 3150                 }
 3151 
 3152                 /*
 3153                  * Accumulate information.  If we have hit a discontiguous
 3154                  * block reset base_offset unless we are already beyond the
 3155                  * requested offset.  If we are, that's it, we stop.
 3156                  */
 3157                 if (error)
 3158                         break;
 3159                 if (hammer_cursor_ondisk(&cursor)) {
 3160                         disk_offset = cursor.leaf->data_offset;
 3161                         if (rec_offset != last_offset ||
 3162                             disk_offset != last_disk_offset) {
 3163                                 if (rec_offset > ap->a_loffset)
 3164                                         break;
 3165                                 base_offset = rec_offset;
 3166                                 base_disk_offset = disk_offset;
 3167                         }
 3168                         last_offset = rec_offset + rec_len;
 3169                         last_disk_offset = disk_offset + rec_len;
 3170 
 3171                         if (hammer_live_dedup)
 3172                                 hammer_dedup_cache_add(ip, cursor.leaf);
 3173                 }
 3174                 
 3175                 error = hammer_ip_next(&cursor);
 3176         }
 3177 
 3178 #if 0
 3179         kprintf("BMAP %016llx:  %016llx - %016llx\n",
 3180                 (long long)ap->a_loffset,
 3181                 (long long)base_offset,
 3182                 (long long)last_offset);
 3183         kprintf("BMAP %16s:  %016llx - %016llx\n", "",
 3184                 (long long)base_disk_offset,
 3185                 (long long)last_disk_offset);
 3186 #endif
 3187 
 3188         if (cursor.node) {
 3189                 hammer_cache_node(&ip->cache[1], cursor.node);
 3190 #if 0
 3191                 kprintf("bmap_end2 %016llx ip->cache %p\n",
 3192                         (long long)ap->a_loffset, ip->cache[1]);
 3193 #endif
 3194         }
 3195         hammer_done_cursor(&cursor);
 3196         hammer_done_transaction(&trans);
 3197         lwkt_reltoken(&hmp->fs_token);
 3198 
 3199         /*
 3200          * If we couldn't find any records or the records we did find were
 3201          * all behind the requested offset, return failure.  A forward
 3202          * truncation can leave a hole w/ no on-disk records.
 3203          */
 3204         if (last_offset == 0 || last_offset < ap->a_loffset)
 3205                 return (EOPNOTSUPP);
 3206 
 3207         /*
 3208          * Figure out the block size at the requested offset and adjust
 3209          * our limits so the cluster_read() does not create inappropriately
 3210          * sized buffer cache buffers.
 3211          */
 3212         blksize = hammer_blocksize(ap->a_loffset);
 3213         if (hammer_blocksize(base_offset) != blksize) {
 3214                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
 3215         }
 3216         if (last_offset != ap->a_loffset &&
 3217             hammer_blocksize(last_offset - 1) != blksize) {
 3218                 last_offset = hammer_blockdemarc(ap->a_loffset,
 3219                                                  last_offset - 1);
 3220         }
 3221 
 3222         /*
 3223          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
 3224          * from occuring.
 3225          */
 3226         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
 3227 
 3228         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
 3229                 /*
 3230                  * Only large-data zones can be direct-IOd
 3231                  */
 3232                 error = EOPNOTSUPP;
 3233         } else if ((disk_offset & HAMMER_BUFMASK) ||
 3234                    (last_offset - ap->a_loffset) < blksize) {
 3235                 /*
 3236                  * doffsetp is not aligned or the forward run size does
 3237                  * not cover a whole buffer, disallow the direct I/O.
 3238                  */
 3239                 error = EOPNOTSUPP;
 3240         } else {
 3241                 /*
 3242                  * We're good.
 3243                  */
 3244                 *ap->a_doffsetp = disk_offset;
 3245                 if (ap->a_runb) {
 3246                         *ap->a_runb = ap->a_loffset - base_offset;
 3247                         KKASSERT(*ap->a_runb >= 0);
 3248                 }
 3249                 if (ap->a_runp) {
 3250                         *ap->a_runp = last_offset - ap->a_loffset;
 3251                         KKASSERT(*ap->a_runp >= 0);
 3252                 }
 3253                 error = 0;
 3254         }
 3255         return(error);
 3256 }
 3257 
 3258 /*
 3259  * Write to a regular file.   Because this is a strategy call the OS is
 3260  * trying to actually get data onto the media.
 3261  */
 3262 static
 3263 int
 3264 hammer_vop_strategy_write(struct vop_strategy_args *ap)
 3265 {
 3266         hammer_record_t record;
 3267         hammer_mount_t hmp;
 3268         hammer_inode_t ip;
 3269         struct bio *bio;
 3270         struct buf *bp;
 3271         int blksize __debugvar;
 3272         int bytes;
 3273         int error;
 3274 
 3275         bio = ap->a_bio;
 3276         bp = bio->bio_buf;
 3277         ip = ap->a_vp->v_data;
 3278         hmp = ip->hmp;
 3279 
 3280         blksize = hammer_blocksize(bio->bio_offset);
 3281         KKASSERT(bp->b_bufsize == blksize);
 3282 
 3283         if (ip->flags & HAMMER_INODE_RO) {
 3284                 bp->b_error = EROFS;
 3285                 bp->b_flags |= B_ERROR;
 3286                 biodone(ap->a_bio);
 3287                 return(EROFS);
 3288         }
 3289 
 3290         lwkt_gettoken(&hmp->fs_token);
 3291 
 3292         /*
 3293          * Disallow swapcache operation on the vnode buffer if double
 3294          * buffering is enabled, the swapcache will get the data via
 3295          * the block device buffer.
 3296          */
 3297         if (hammer_double_buffer)
 3298                 bp->b_flags |= B_NOTMETA;
 3299 
 3300         /*
 3301          * Interlock with inode destruction (no in-kernel or directory
 3302          * topology visibility).  If we queue new IO while trying to
 3303          * destroy the inode we can deadlock the vtrunc call in
 3304          * hammer_inode_unloadable_check().
 3305          *
 3306          * Besides, there's no point flushing a bp associated with an
 3307          * inode that is being destroyed on-media and has no kernel
 3308          * references.
 3309          */
 3310         if ((ip->flags | ip->sync_flags) &
 3311             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
 3312                 bp->b_resid = 0;
 3313                 biodone(ap->a_bio);
 3314                 lwkt_reltoken(&hmp->fs_token);
 3315                 return(0);
 3316         }
 3317 
 3318         /*
 3319          * Reserve space and issue a direct-write from the front-end. 
 3320          * NOTE: The direct_io code will hammer_bread/bcopy smaller
 3321          * allocations.
 3322          *
 3323          * An in-memory record will be installed to reference the storage
 3324          * until the flusher can get to it.
 3325          *
 3326          * Since we own the high level bio the front-end will not try to
 3327          * do a direct-read until the write completes.
 3328          *
 3329          * NOTE: The only time we do not reserve a full-sized buffers
 3330          * worth of data is if the file is small.  We do not try to
 3331          * allocate a fragment (from the small-data zone) at the end of
 3332          * an otherwise large file as this can lead to wildly separated
 3333          * data.
 3334          */
 3335         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
 3336         KKASSERT(bio->bio_offset < ip->ino_data.size);
 3337         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
 3338                 bytes = bp->b_bufsize;
 3339         else
 3340                 bytes = ((int)ip->ino_data.size + 15) & ~15;
 3341 
 3342         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
 3343                                     bytes, &error);
 3344 
 3345         /*
 3346          * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
 3347          * in hammer_vop_write().  We must flag the record so the proper
 3348          * REDO_TERM_WRITE entry is generated during the flush.
 3349          */
 3350         if (record) {
 3351                 if (bp->b_flags & B_VFSFLAG1) {
 3352                         record->flags |= HAMMER_RECF_REDO;
 3353                         bp->b_flags &= ~B_VFSFLAG1;
 3354                 }
 3355                 if (record->flags & HAMMER_RECF_DEDUPED) {
 3356                         bp->b_resid = 0;
 3357                         hammer_ip_replace_bulk(hmp, record);
 3358                         biodone(ap->a_bio);
 3359                 } else {
 3360                         hammer_io_direct_write(hmp, bio, record);
 3361                 }
 3362                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
 3363                         hammer_flush_inode(ip, 0);
 3364         } else {
 3365                 bp->b_bio2.bio_offset = NOOFFSET;
 3366                 bp->b_error = error;
 3367                 bp->b_flags |= B_ERROR;
 3368                 biodone(ap->a_bio);
 3369         }
 3370         lwkt_reltoken(&hmp->fs_token);
 3371         return(error);
 3372 }
 3373 
 3374 /*
 3375  * dounlink - disconnect a directory entry
 3376  *
       * Scans the directory's entries in the namekey range for the name held
       * in nch->ncp, loads the inode the entry points at, optionally checks
       * its type against isdir, requires a target directory to be empty,
       * deletes the directory entry, and on success calls cache_unlink(nch)
       * and posts NOTE_DELETE on the target's vnode if it has one.
       *
       * trans  - transaction handed to the cursor / inode helpers
       * nch    - namecache handle; nch->ncp supplies the name to remove
       * dvp    - vnode of the directory containing the entry
       * cred   - not referenced in this function body
       * flags  - not referenced in this function body
       * isdir  - > 0: target must be a directory (else ENOTDIR)
       *          = 0: target must not be a directory (else EISDIR)
       *          < 0: no type check
       *
       * Returns 0 or an errno.  EROFS is returned if the directory inode is
       * read-only.  EDEADLK is never returned: it restarts the whole scan.
       *
 3377  * XXX whiteout support not really in yet
 3378  */
 3379 static int
 3380 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
 3381                 struct vnode *dvp, struct ucred *cred, 
 3382                 int flags, int isdir)
 3383 {
 3384         struct namecache *ncp;
 3385         hammer_inode_t dip;
 3386         hammer_inode_t ip;
 3387         hammer_mount_t hmp;
 3388         struct hammer_cursor cursor;
 3389         int64_t namekey;
 3390         u_int32_t max_iterations;
 3391         int nlen, error;
 3392 
 3393         /*
 3394          * Calculate the namekey and setup the key range for the scan.  This
 3395          * works kinda like a chained hash table where the lower 32 bits
 3396          * of the namekey synthesize the chain.
 3397          *
 3398          * The key range is inclusive of both key_beg and key_end.
 3399          */
 3400         dip = VTOI(dvp);
 3401         ncp = nch->ncp;
 3402         hmp = dip->hmp;
 3403 
 3404         if (dip->flags & HAMMER_INODE_RO)
 3405                 return (EROFS);
 3406 
 3407         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
 3408                                            &max_iterations);
              /*
               * The cursor is rebuilt from scratch each time we come back
               * here after an EDEADLK.
               */
 3409 retry:
 3410         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
 3411         cursor.key_beg.localization = dip->obj_localization +
 3412                                       hammer_dir_localization(dip);
 3413         cursor.key_beg.obj_id = dip->obj_id;
 3414         cursor.key_beg.key = namekey;
 3415         cursor.key_beg.create_tid = 0;
 3416         cursor.key_beg.delete_tid = 0;
 3417         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
 3418         cursor.key_beg.obj_type = 0;
 3419 
 3420         cursor.key_end = cursor.key_beg;
 3421         cursor.key_end.key += max_iterations;
 3422         cursor.asof = dip->obj_asof;
 3423         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
 3424 
 3425         /*
 3426          * Scan all matching records (the chain), locate the one matching
 3427          * the requested path component.  On loop exit, error holds the
 3428          * result of the search and could be 0 (found), ENOENT, or
 3429          * something else.
 3430          *
 3431          * The hammer_ip_*() functions merge in-memory records with on-disk
 3432          * records for the purposes of the search.
 3433          */
 3434         error = hammer_ip_first(&cursor);
 3435 
 3436         while (error == 0) {
 3437                 error = hammer_ip_resolve_data(&cursor);
 3438                 if (error)
 3439                         break;
 3440                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
 3441                 KKASSERT(nlen > 0);
 3442                 if (ncp->nc_nlen == nlen &&
 3443                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
 3444                         break;
 3445                 }
 3446                 error = hammer_ip_next(&cursor);
 3447         }
 3448 
 3449         /*
 3450          * If all is ok we have to get the inode so we can adjust nlinks.
 3451          * To avoid a deadlock with the flusher we must release the inode
 3452          * lock on the directory when acquiring the inode for the entry.
 3453          *
 3454          * If the target is a directory, it must be empty.
 3455          */
 3456         if (error == 0) {
 3457                 hammer_unlock(&cursor.ip->lock);
 3458                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
 3459                                       hmp->asof,
 3460                                       cursor.data->entry.localization,
 3461                                       0, &error);
 3462                 hammer_lock_sh(&cursor.ip->lock);
                      /*
                       * A dirent whose inode is missing is still removed:
                       * the ENOENT is cleared after the warning.  Every use
                       * of ip below is guarded by an ip check.
                       * NOTE(review): ip is presumably NULL here - confirm
                       * against hammer_get_inode().
                       */
 3463                 if (error == ENOENT) {
 3464                         kprintf("HAMMER: WARNING: Removing "
 3465                                 "dirent w/missing inode \"%s\"\n"
 3466                                 "\tobj_id = %016llx\n",
 3467                                 ncp->nc_name,
 3468                                 (long long)cursor.data->entry.obj_id);
 3469                         error = 0;
 3470                 }
 3471 
 3472                 /*
 3473                  * If isdir >= 0 we validate that the entry is or is not a
 3474                  * directory.  If isdir < 0 we don't care.
 3475                  */
 3476                 if (error == 0 && isdir >= 0 && ip) {
 3477                         if (isdir &&
 3478                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
 3479                                 error = ENOTDIR;
 3480                         } else if (isdir == 0 &&
 3481                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
 3482                                 error = EISDIR;
 3483                         }
 3484                 }
 3485 
 3486                 /*
 3487                  * If we are trying to remove a directory the directory must
 3488                  * be empty.
 3489                  *
 3490                  * The check directory code can loop and deadlock/retry.  Our
 3491                  * own cursor's node locks must be released to avoid a 3-way
 3492                  * deadlock with the flusher if the check directory code
 3493                  * blocks.
 3494                  *
 3495                  * If any changes whatsoever have been made to the cursor
 3496                  * set EDEADLK and retry.
 3497                  *
 3498                  * WARNING: See warnings in hammer_unlock_cursor()
 3499                  *          function.
 3500                  */
 3501                 if (error == 0 && ip && ip->ino_data.obj_type ==
 3502                                         HAMMER_OBJTYPE_DIRECTORY) {
 3503                         hammer_unlock_cursor(&cursor);
 3504                         error = hammer_ip_check_directory_empty(trans, ip);
 3505                         hammer_lock_cursor(&cursor);
 3506                         if (cursor.flags & HAMMER_CURSOR_RETEST) {
 3507                                 kprintf("HAMMER: Warning: avoided deadlock "
 3508                                         "on rmdir '%s'\n",
 3509                                         ncp->nc_name);
 3510                                 error = EDEADLK;
 3511                         }
 3512                 }
 3513 
 3514                 /*
 3515                  * Delete the directory entry.
 3516                  *
 3517                  * WARNING: hammer_ip_del_directory() may have to terminate
 3518                  * the cursor to avoid a deadlock.  It is ok to call
 3519                  * hammer_done_cursor() twice.
 3520                  */
 3521                 if (error == 0) {
 3522                         error = hammer_ip_del_directory(trans, &cursor,
 3523                                                         dip, ip);
 3524                 }
 3525                 hammer_done_cursor(&cursor);
 3526                 if (error == 0) {
 3527                         /*
 3528                          * Tell the namecache that we are now unlinked.
 3529                          */
 3530                         cache_unlink(nch);
 3531 
 3532                         /*
 3533                          * NOTE: ip->vp, if non-NULL, cannot be directly
 3534                          *       referenced without formally acquiring the
 3535                          *       vp since the vp might have zero refs on it,
 3536                          *       or in the middle of a reclaim, etc.
 3537                          *
 3538                          * NOTE: The cache_setunresolved() can rip the vp
 3539                          *       out from under us since the vp may not have
 3540                          *       any refs, in which case ip->vp will be NULL
 3541                          *       from the outset.
                               *
                               * The loop keeps trying hammer_get_vnode() for as
                               * long as ip->vp is non-NULL.  The result of the
                               * last attempt is left in error and is what the
                               * function returns.
 3542                          */
 3543                         while (ip && ip->vp) {
 3544                                 struct vnode *vp;
 3545 
 3546                                 error = hammer_get_vnode(ip, &vp);
 3547                                 if (error == 0 && vp) {
 3548                                         vn_unlock(vp);
                                              /*
                                               * NOTE(review): ip->vp is used
                                               * here while vp is the one just
                                               * acquired; presumably they are
                                               * the same vnode - confirm.
                                               */
 3549                                         hammer_knote(ip->vp, NOTE_DELETE);
 3550 #if 0
 3551                                         /*
 3552                                          * Don't do this, it can deadlock
 3553                                          * on concurrent rm's of hardlinks.
 3554                                          * Shouldn't be needed any more.
 3555                                          */
 3556                                         cache_inval_vp(ip->vp, CINV_DESTROY);
 3557 #endif
 3558                                         vrele(vp);
 3559                                         break;
 3560                                 }
 3561                                 kprintf("Debug: HAMMER ip/vp race1 avoided\n");
 3562                         }
 3563                 }
 3564                 if (ip)
 3565                         hammer_rel_inode(ip, 0);
 3566         } else {
 3567                 hammer_done_cursor(&cursor);
 3568         }
              /*
               * The cursor is finished and the target inode released on every
               * path above, so the retry starts clean.
               */
 3569         if (error == EDEADLK)
 3570                 goto retry;
 3571 
 3572         return (error);
 3573 }
 3574 
 3575 /************************************************************************
 3576  *                          FIFO AND SPECFS OPS                         *
 3577  ************************************************************************
 3578  *
 3579  */
 3580 static int
 3581 hammer_vop_fifoclose (struct vop_close_args *ap)
 3582 {
 3583         /* XXX update itimes */
 3584         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
 3585 }
 3586 
 3587 static int
 3588 hammer_vop_fiforead (struct vop_read_args *ap)
 3589 {
 3590         int error;
 3591 
 3592         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
 3593         /* XXX update access time */
 3594         return (error);
 3595 }
 3596 
 3597 static int
 3598 hammer_vop_fifowrite (struct vop_write_args *ap)
 3599 {
 3600         int error;
 3601 
 3602         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
 3603         /* XXX update access time */
 3604         return (error);
 3605 }
 3606 
 3607 static
 3608 int
 3609 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
 3610 {
 3611         int error;
 3612 
 3613         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
 3614         if (error)
 3615                 error = hammer_vop_kqfilter(ap);
 3616         return(error);
 3617 }
 3618 
 3619 /************************************************************************
 3620  *                          KQFILTER OPS                                *
 3621  ************************************************************************
 3622  *
 3623  */
 3624 static void filt_hammerdetach(struct knote *kn);
 3625 static int filt_hammerread(struct knote *kn, long hint);
 3626 static int filt_hammerwrite(struct knote *kn, long hint);
 3627 static int filt_hammervnode(struct knote *kn, long hint);
 3628 
      /*
       * Filter operation tables, one per filter type accepted by
       * hammer_vop_kqfilter() (EVFILT_READ, EVFILT_WRITE, EVFILT_VNODE).
       * All three share filt_hammerdetach and differ only in the last
       * callback.
       *
       * NOTE(review): the initializers are positional; presumably the
       * slots are flags, attach (unused, NULL), detach and event - confirm
       * against the struct filterops declaration.
       */
 3629 static struct filterops hammerread_filtops =
 3630         { FILTEROP_ISFD | FILTEROP_MPSAFE,
 3631           NULL, filt_hammerdetach, filt_hammerread };
 3632 static struct filterops hammerwrite_filtops =
 3633         { FILTEROP_ISFD | FILTEROP_MPSAFE,
 3634           NULL, filt_hammerdetach, filt_hammerwrite };
 3635 static struct filterops hammervnode_filtops =
 3636         { FILTEROP_ISFD | FILTEROP_MPSAFE,
 3637           NULL, filt_hammerdetach, filt_hammervnode };
 3638 
 3639 static
 3640 int
 3641 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
 3642 {
 3643         struct vnode *vp = ap->a_vp;
 3644         struct knote *kn = ap->a_kn;
 3645 
 3646         switch (kn->kn_filter) {
 3647         case EVFILT_READ:
 3648                 kn->kn_fop = &hammerread_filtops;
 3649                 break;
 3650         case EVFILT_WRITE:
 3651                 kn->kn_fop = &hammerwrite_filtops;
 3652                 break;
 3653         case EVFILT_VNODE:
 3654                 kn->kn_fop = &hammervnode_filtops;
 3655                 break;
 3656         default:
 3657                 return (EOPNOTSUPP);
 3658         }
 3659 
 3660         kn->kn_hook = (caddr_t)vp;
 3661 
 3662         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
 3663 
 3664         return(0);
 3665 }
 3666 
 3667 static void
 3668 filt_hammerdetach(struct knote *kn)
 3669 {
 3670         struct vnode *vp = (void *)kn->kn_hook;
 3671 
 3672         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
 3673 }
 3674 
 3675 static int
 3676 filt_hammerread(struct knote *kn, long hint)
 3677 {
 3678         struct vnode *vp = (void *)kn->kn_hook;
 3679         hammer_inode_t ip = VTOI(vp);
 3680         hammer_mount_t hmp = ip->hmp;
 3681         off_t off;
 3682 
 3683         if (hint == NOTE_REVOKE) {
 3684                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
 3685                 return(1);
 3686         }
 3687         lwkt_gettoken(&hmp->fs_token);  /* XXX use per-ip-token */
 3688         off = ip->ino_data.size - kn->kn_fp->f_offset;
 3689         kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
 3690         lwkt_reltoken(&hmp->fs_token);
 3691         if (kn->kn_sfflags & NOTE_OLDAPI)
 3692                 return(1);
 3693         return (kn->kn_data != 0);
 3694 }
 3695 
 3696 static int
 3697 filt_hammerwrite(struct knote *kn, long hint)
 3698 {
 3699         if (hint == NOTE_REVOKE)
 3700                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
 3701         kn->kn_data = 0;
 3702         return (1);
 3703 }
 3704 
 3705 static int
 3706 filt_hammervnode(struct knote *kn, long hint)
 3707 {
 3708         if (kn->kn_sfflags & hint)
 3709                 kn->kn_fflags |= hint;
 3710         if (hint == NOTE_REVOKE) {
 3711                 kn->kn_flags |= (EV_EOF | EV_NODATA);
 3712                 return (1);
 3713         }
 3714         return (kn->kn_fflags != 0);
 3715 }
 3716 

Cache object: 0b99dab3f554bcd276a95555e4d3072d


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.