The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   24  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
   25  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
   26  * Copyright 2017 Nexenta Systems, Inc.
   27  */
   28 
   29 /* Portions Copyright 2007 Jeremy Teo */
   30 /* Portions Copyright 2010 Robert Milkowski */
   31 
   32 
   33 #include <sys/types.h>
   34 #include <sys/param.h>
   35 #include <sys/time.h>
   36 #include <sys/sysmacros.h>
   37 #include <sys/vfs.h>
   38 #include <sys/file.h>
   39 #include <sys/stat.h>
   40 #include <sys/kmem.h>
   41 #include <sys/taskq.h>
   42 #include <sys/uio.h>
   43 #include <sys/vmsystm.h>
   44 #include <sys/atomic.h>
   45 #include <sys/pathname.h>
   46 #include <sys/cmn_err.h>
   47 #include <sys/errno.h>
   48 #include <sys/zfs_dir.h>
   49 #include <sys/zfs_acl.h>
   50 #include <sys/zfs_ioctl.h>
   51 #include <sys/fs/zfs.h>
   52 #include <sys/dmu.h>
   53 #include <sys/dmu_objset.h>
   54 #include <sys/spa.h>
   55 #include <sys/txg.h>
   56 #include <sys/dbuf.h>
   57 #include <sys/zap.h>
   58 #include <sys/sa.h>
   59 #include <sys/policy.h>
   60 #include <sys/sunddi.h>
   61 #include <sys/sid.h>
   62 #include <sys/zfs_ctldir.h>
   63 #include <sys/zfs_fuid.h>
   64 #include <sys/zfs_quota.h>
   65 #include <sys/zfs_sa.h>
   66 #include <sys/zfs_vnops.h>
   67 #include <sys/zfs_rlock.h>
   68 #include <sys/cred.h>
   69 #include <sys/zpl.h>
   70 #include <sys/zil.h>
   71 #include <sys/sa_impl.h>
   72 
   73 /*
   74  * Programming rules.
   75  *
   76  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
   77  * properly lock its in-core state, create a DMU transaction, do the work,
   78  * record this work in the intent log (ZIL), commit the DMU transaction,
   79  * and wait for the intent log to commit if it is a synchronous operation.
   80  * Moreover, the vnode ops must work in both normal and log replay context.
   81  * The ordering of events is important to avoid deadlocks and references
   82  * to freed memory.  The example below illustrates the following Big Rules:
   83  *
   84  *  (1) A check must be made in each zfs thread for a mounted file system.
   85  *      This is done in a race-free manner using zfs_enter(zfsvfs).
   86  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
   87  *      must be checked with zfs_verify_zp(zp).  Both of these checks
   88  *      can fail with EIO, which the calling function must then return.
   89  *
   90  *  (2) zrele() should always be the last thing except for zil_commit() (if
   91  *      necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
   92  *      last reference, the vnode/znode can be freed, so the zp may point to
   93  *      freed memory.  Second, the last reference will call zfs_zinactive(),
   94  *      which may induce a lot of work -- pushing cached pages (which acquires
   95  *      range locks) and syncing out cached atime changes.  Third,
   96  *      zfs_zinactive() may require a new tx, which could deadlock the system
   97  *      if you were already holding one. This deadlock occurs because the tx
   98  *      currently being operated on prevents a txg from syncing, which
   99  *      prevents the new tx from progressing, resulting in a deadlock.  If you
  100  *      must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
  101  *      is a synonym for zrele().
  102  *
  103  *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
  104  *      as they can span dmu_tx_assign() calls.
  105  *
  106  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  107  *      dmu_tx_assign().  This is critical because we don't want to block
  108  *      while holding locks.
  109  *
  110  *      If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
  111  *      reduces lock contention and CPU usage when we must wait (note that if
  112  *      throughput is constrained by the storage, nearly every transaction
  113  *      must wait).
  114  *
  115  *      Note, in particular, that if a lock is sometimes acquired before
  116  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  117  *      to use a non-blocking assign can deadlock the system.  The scenario:
  118  *
  119  *      Thread A has grabbed a lock before calling dmu_tx_assign().
  120  *      Thread B is in an already-assigned tx, and blocks for this lock.
  121  *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  122  *      forever, because the previous txg can't quiesce until B's tx commits.
  123  *
  124  *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  125  *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  126  *      calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  127  *      to indicate that this operation has already called dmu_tx_wait().
  128  *      This will ensure that we don't retry forever, waiting a short bit
  129  *      each time.
  130  *
  131  *  (5) If the operation succeeded, generate the intent log entry for it
  132  *      before dropping locks.  This ensures that the ordering of events
  133  *      in the intent log matches the order in which they actually occurred.
  134  *      During ZIL replay the zfs_log_* functions will update the sequence
  135  *      number to indicate the zil transaction has replayed.
  136  *
  137  *  (6) At the end of each vnode op, the DMU tx must always commit,
  138  *      regardless of whether there were any errors.
  139  *
  140  *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
  141  *      to ensure that synchronous semantics are provided when necessary.
  142  *
  143  * In general, this is how things should be ordered in each vnode op:
  144  *
  145  *      zfs_enter(zfsvfs);              // exit if unmounted
  146  * top:
  147  *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may igrab())
  148  *      rw_enter(...);                  // grab any other locks you need
  149  *      tx = dmu_tx_create(...);        // get DMU tx
  150  *      dmu_tx_hold_*();                // hold each object you might modify
  151  *      error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  152  *      if (error) {
  153  *              rw_exit(...);           // drop locks
  154  *              zfs_dirent_unlock(dl);  // unlock directory entry
  155  *              zrele(...);             // release held znodes
  156  *              if (error == ERESTART) {
  157  *                      waited = B_TRUE;
  158  *                      dmu_tx_wait(tx);
  159  *                      dmu_tx_abort(tx);
  160  *                      goto top;
  161  *              }
  162  *              dmu_tx_abort(tx);       // abort DMU tx
  163  *              zfs_exit(zfsvfs);       // finished in zfs
  164  *              return (error);         // really out of space
  165  *      }
  166  *      error = do_real_work();         // do whatever this VOP does
  167  *      if (error == 0)
  168  *              zfs_log_*(...);         // on success, make ZIL entry
  169  *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
  170  *      rw_exit(...);                   // drop locks
  171  *      zfs_dirent_unlock(dl);          // unlock directory entry
  172  *      zrele(...);                     // release held znodes
  173  *      zil_commit(zilog, foid);        // synchronous when necessary
  174  *      zfs_exit(zfsvfs);               // finished in zfs
  175  *      return (error);                 // done, report error
  176  */
  177 int
  178 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
  179 {
  180         (void) cr;
  181         znode_t *zp = ITOZ(ip);
  182         zfsvfs_t *zfsvfs = ITOZSB(ip);
  183         int error;
  184 
  185         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
  186                 return (error);
  187 
  188         /* Honor ZFS_APPENDONLY file attribute */
  189         if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
  190             ((flag & O_APPEND) == 0)) {
  191                 zfs_exit(zfsvfs, FTAG);
  192                 return (SET_ERROR(EPERM));
  193         }
  194 
  195         /* Keep a count of the synchronous opens in the znode */
  196         if (flag & O_SYNC)
  197                 atomic_inc_32(&zp->z_sync_cnt);
  198 
  199         zfs_exit(zfsvfs, FTAG);
  200         return (0);
  201 }
  202 
  203 int
  204 zfs_close(struct inode *ip, int flag, cred_t *cr)
  205 {
  206         (void) cr;
  207         znode_t *zp = ITOZ(ip);
  208         zfsvfs_t *zfsvfs = ITOZSB(ip);
  209         int error;
  210 
  211         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
  212                 return (error);
  213 
  214         /* Decrement the synchronous opens in the znode */
  215         if (flag & O_SYNC)
  216                 atomic_dec_32(&zp->z_sync_cnt);
  217 
  218         zfs_exit(zfsvfs, FTAG);
  219         return (0);
  220 }
  221 
  222 #if defined(_KERNEL)
  223 /*
  224  * When a file is memory mapped, we must keep the IO data synchronized
  225  * between the DMU cache and the memory mapped pages.  What this means:
  226  *
  227  * On Write:    If we find a memory mapped page, we write to *both*
  228  *              the page and the dmu buffer.
  229  */
  230 void
  231 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
  232 {
  233         struct inode *ip = ZTOI(zp);
  234         struct address_space *mp = ip->i_mapping;
  235         struct page *pp;
  236         uint64_t nbytes;
  237         int64_t off;
  238         void *pb;
  239 
  240         off = start & (PAGE_SIZE-1);
  241         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
  242                 nbytes = MIN(PAGE_SIZE - off, len);
  243 
  244                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
  245                 if (pp) {
  246                         if (mapping_writably_mapped(mp))
  247                                 flush_dcache_page(pp);
  248 
  249                         pb = kmap(pp);
  250                         (void) dmu_read(os, zp->z_id, start + off, nbytes,
  251                             pb + off, DMU_READ_PREFETCH);
  252                         kunmap(pp);
  253 
  254                         if (mapping_writably_mapped(mp))
  255                                 flush_dcache_page(pp);
  256 
  257                         mark_page_accessed(pp);
  258                         SetPageUptodate(pp);
  259                         ClearPageError(pp);
  260                         unlock_page(pp);
  261                         put_page(pp);
  262                 }
  263 
  264                 len -= nbytes;
  265                 off = 0;
  266         }
  267 }
  268 
  269 /*
  270  * When a file is memory mapped, we must keep the IO data synchronized
  271  * between the DMU cache and the memory mapped pages.  What this means:
  272  *
  273  * On Read:     We "read" preferentially from memory mapped pages,
  274  *              else we default from the dmu buffer.
  275  *
  276  * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
  277  *       the file is memory mapped.
  278  */
  279 int
  280 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
  281 {
  282         struct inode *ip = ZTOI(zp);
  283         struct address_space *mp = ip->i_mapping;
  284         struct page *pp;
  285         int64_t start, off;
  286         uint64_t bytes;
  287         int len = nbytes;
  288         int error = 0;
  289         void *pb;
  290 
  291         start = uio->uio_loffset;
  292         off = start & (PAGE_SIZE-1);
  293         for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
  294                 bytes = MIN(PAGE_SIZE - off, len);
  295 
  296                 pp = find_lock_page(mp, start >> PAGE_SHIFT);
  297                 if (pp) {
  298                         ASSERT(PageUptodate(pp));
  299                         unlock_page(pp);
  300 
  301                         pb = kmap(pp);
  302                         error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
  303                         kunmap(pp);
  304 
  305                         if (mapping_writably_mapped(mp))
  306                                 flush_dcache_page(pp);
  307 
  308                         mark_page_accessed(pp);
  309                         put_page(pp);
  310                 } else {
  311                         error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
  312                             uio, bytes);
  313                 }
  314 
  315                 len -= bytes;
  316                 off = 0;
  317                 if (error)
  318                         break;
  319         }
  320         return (error);
  321 }
  322 #endif /* _KERNEL */
  323 
  /*
   * NOTE(review): no use of this variable appears in the portion of the
   * file reviewed here.  Its name and initializer (DMU_MAX_DELETEBLKCNT)
   * suggest a limit on the number of blocks handled per delete -- confirm
   * against its users.
   */
  324 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
  325 
  326 /*
  327  * Write the bytes to a file.
  328  *
  329  *      IN:     zp      - znode of file to be written to
  330  *              data    - bytes to write
  331  *              len     - number of bytes to write
  332  *              pos     - offset to start writing at
  333  *
  334  *      OUT:    resid   - remaining bytes to write
  335  *
  336  *      RETURN: 0 if success
  337  *              positive error code if failure.  EIO is returned
  338  *              for a short write when residp isn't provided.
  339  *
  340  * Timestamps:
  341  *      zp - ctime|mtime updated if byte count > 0
  342  */
  343 int
  344 zfs_write_simple(znode_t *zp, const void *data, size_t len,
  345     loff_t pos, size_t *residp)
  346 {
  347         fstrans_cookie_t cookie;
  348         int error;
  349 
  350         struct iovec iov;
  351         iov.iov_base = (void *)data;
  352         iov.iov_len = len;
  353 
  354         zfs_uio_t uio;
  355         zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
  356 
  357         cookie = spl_fstrans_mark();
  358         error = zfs_write(zp, &uio, 0, kcred);
  359         spl_fstrans_unmark(cookie);
  360 
  361         if (error == 0) {
  362                 if (residp != NULL)
  363                         *residp = zfs_uio_resid(&uio);
  364                 else if (zfs_uio_resid(&uio) != 0)
  365                         error = SET_ERROR(EIO);
  366         }
  367 
  368         return (error);
  369 }
  370 
  /*
   * Taskq callback used by zfs_zrele_async(): arg is the inode whose
   * reference is dropped with iput().
   */
  static void
  zfs_rele_async_task(void *arg)
  {
          struct inode *ip = arg;

          iput(ip);
  }
  376 
  377 void
  378 zfs_zrele_async(znode_t *zp)
  379 {
  380         struct inode *ip = ZTOI(zp);
  381         objset_t *os = ITOZSB(ip)->z_os;
  382 
  383         ASSERT(atomic_read(&ip->i_count) > 0);
  384         ASSERT(os != NULL);
  385 
  386         /*
  387          * If decrementing the count would put us at 0, we can't do it inline
  388          * here, because that would be synchronous. Instead, dispatch an iput
  389          * to run later.
  390          *
  391          * For more information on the dangers of a synchronous iput, see the
  392          * header comment of this file.
  393          */
  394         if (!atomic_add_unless(&ip->i_count, -1, 1)) {
  395                 VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
  396                     zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
  397         }
  398 }
  399 
  400 
  401 /*
  402  * Lookup an entry in a directory, or an extended attribute directory.
  403  * If it exists, return a held inode reference for it.
  404  *
  405  *      IN:     zdp     - znode of directory to search.
  406  *              nm      - name of entry to lookup.
  407  *              flags   - LOOKUP_XATTR set if looking for an attribute.
  408  *              cr      - credentials of caller.
  409  *              direntflags - directory lookup flags
  410  *              realpnp - returned pathname.
  411  *
  412  *      OUT:    zpp     - znode of located entry, NULL if not found.
  413  *
  414  *      RETURN: 0 on success, error code on failure.
  415  *
  416  * Timestamps:
  417  *      NA
  418  */
  419 int
  420 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
  421     int *direntflags, pathname_t *realpnp)
  422 {
  423         zfsvfs_t *zfsvfs = ZTOZSB(zdp);
  424         int error = 0;
  425 
  426         /*
  427          * Fast path lookup, however we must skip DNLC lookup
  428          * for case folding or normalizing lookups because the
  429          * DNLC code only stores the passed in name.  This means
  430          * creating 'a' and removing 'A' on a case insensitive
  431          * file system would work, but DNLC still thinks 'a'
  432          * exists and won't let you create it again on the next
  433          * pass through fast path.
  434          */
  435         if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
  436 
  437                 if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
  438                         return (SET_ERROR(ENOTDIR));
  439                 } else if (zdp->z_sa_hdl == NULL) {
  440                         return (SET_ERROR(EIO));
  441                 }
  442 
  443                 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
  444                         error = zfs_fastaccesschk_execute(zdp, cr);
  445                         if (!error) {
  446                                 *zpp = zdp;
  447                                 zhold(*zpp);
  448                                 return (0);
  449                         }
  450                         return (error);
  451                 }
  452         }
  453 
  454         if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
  455                 return (error);
  456 
  457         *zpp = NULL;
  458 
  459         if (flags & LOOKUP_XATTR) {
  460                 /*
  461                  * We don't allow recursive attributes..
  462                  * Maybe someday we will.
  463                  */
  464                 if (zdp->z_pflags & ZFS_XATTR) {
  465                         zfs_exit(zfsvfs, FTAG);
  466                         return (SET_ERROR(EINVAL));
  467                 }
  468 
  469                 if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
  470                         zfs_exit(zfsvfs, FTAG);
  471                         return (error);
  472                 }
  473 
  474                 /*
  475                  * Do we have permission to get into attribute directory?
  476                  */
  477 
  478                 if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
  479                     B_TRUE, cr, kcred->user_ns))) {
  480                         zrele(*zpp);
  481                         *zpp = NULL;
  482                 }
  483 
  484                 zfs_exit(zfsvfs, FTAG);
  485                 return (error);
  486         }
  487 
  488         if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
  489                 zfs_exit(zfsvfs, FTAG);
  490                 return (SET_ERROR(ENOTDIR));
  491         }
  492 
  493         /*
  494          * Check accessibility of directory.
  495          */
  496 
  497         if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
  498             kcred->user_ns))) {
  499                 zfs_exit(zfsvfs, FTAG);
  500                 return (error);
  501         }
  502 
  503         if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
  504             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
  505                 zfs_exit(zfsvfs, FTAG);
  506                 return (SET_ERROR(EILSEQ));
  507         }
  508 
  509         error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
  510         if ((error == 0) && (*zpp))
  511                 zfs_znode_update_vfs(*zpp);
  512 
  513         zfs_exit(zfsvfs, FTAG);
  514         return (error);
  515 }
  516 
  517 /*
  518  * Attempt to create a new entry in a directory.  If the entry
  519  * already exists, truncate the file if permissible, else return
  520  * an error.  Return the znode of the created or trunc'd file.
  521  *
  522  *      IN:     dzp     - znode of directory to put new file entry in.
  523  *              name    - name of new file entry; an empty string refers
       *                        to dzp itself, NULL is rejected with EINVAL.
  524  *              vap     - attributes of new file.
  525  *              excl    - flag indicating exclusive or non-exclusive mode.
  526  *              mode    - mode to open file with.
  527  *              cr      - credentials of caller.
  528  *              flag    - file flag; ATTR_NOACLCHECK, FIGNORECASE and
       *                        O_APPEND are examined.
  529  *              vsecp   - ACL to be set
  530  *              mnt_ns  - user namespace of the mount
  531  *
  532  *      OUT:    zpp     - znode of created or trunc'd entry.
  533  *
  534  *      RETURN: 0 on success, error code on failure.
  535  *
  536  * Timestamps:
  537  *      dzp - ctime|mtime updated if new entry created
  538  *       zp - ctime|mtime always, atime if new
       *
       * Retry:
       *      If dmu_tx_assign() returns ERESTART, the directory entry lock is
       *      dropped, dmu_tx_wait() is called and the operation restarts at
       *      the top: label.  ACL ids built on an earlier pass are kept and
       *      reused (tracked by have_acl), so every exit taken after that
       *      point has to free them.
       *
       * Sync:
       *      When the objset's os_sync is ZFS_SYNC_ALWAYS, zil_commit() is
       *      called before returning from the common exit path.
  539  */
  540 int
  541 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
  542     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
  543     zuserns_t *mnt_ns)
  544 {
  545         znode_t         *zp;
  546         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
  547         zilog_t         *zilog;
  548         objset_t        *os;
  549         zfs_dirlock_t   *dl;
  550         dmu_tx_t        *tx;
  551         int             error;
  552         uid_t           uid;
  553         gid_t           gid;
  554         zfs_acl_ids_t   acl_ids;
  555         boolean_t       fuid_dirtied;
  556         boolean_t       have_acl = B_FALSE;
  557         boolean_t       waited = B_FALSE;
  558         boolean_t       skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
  559 
  560         /*
  561          * If we have an ephemeral id, ACL, or XVATTR then
  562          * make sure file system is at proper version
  563          */
  564 
  565         gid = crgetgid(cr);
  566         uid = crgetuid(cr);
  567 
  568         if (zfsvfs->z_use_fuids == B_FALSE &&
  569             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
  570                 return (SET_ERROR(EINVAL));
  571 
  572         if (name == NULL)
  573                 return (SET_ERROR(EINVAL));
  574 
  575         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
  576                 return (error);
  577         os = zfsvfs->z_os;
  578         zilog = zfsvfs->z_log;
  579 
              /* With z_utf8 set, a name failing u8_validate() is rejected. */
  580         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
  581             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
  582                 zfs_exit(zfsvfs, FTAG);
  583                 return (SET_ERROR(EILSEQ));
  584         }
  585 
  586         if (vap->va_mask & ATTR_XVATTR) {
  587                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
  588                     crgetuid(cr), cr, vap->va_mode)) != 0) {
  589                         zfs_exit(zfsvfs, FTAG);
  590                         return (error);
  591                 }
  592         }
  593 
      /* Restart point after dmu_tx_assign() returned ERESTART below. */
  594 top:
  595         *zpp = NULL;
  596         if (*name == '\0') {
  597                 /*
  598                  * Null component name refers to the directory itself.
  599                  */
  600                 zhold(dzp);
  601                 zp = dzp;
  602                 dl = NULL;
  603                 error = 0;
  604         } else {
  605                 /* possible igrab(zp) */
  606                 int zflg = 0;
  607 
  608                 if (flag & FIGNORECASE)
  609                         zflg |= ZCILOOK;
  610 
  611                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
  612                     NULL, NULL);
  613                 if (error) {
                              /* acl_ids may survive from an earlier pass. */
  614                         if (have_acl)
  615                                 zfs_acl_ids_free(&acl_ids);
                              /* A failed lock on ".." is reported as EISDIR. */
  616                         if (strcmp(name, "..") == 0)
  617                                 error = SET_ERROR(EISDIR);
  618                         zfs_exit(zfsvfs, FTAG);
  619                         return (error);
  620                 }
  621         }
  622 
              /* zp == NULL: no existing entry was found, so create one. */
  623         if (zp == NULL) {
  624                 uint64_t txtype;
  625                 uint64_t projid = ZFS_DEFAULT_PROJID;
  626 
  627                 /*
  628                  * Create a new file object and update the directory
  629                  * to reference it.
  630                  */
  631                 if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
  632                     mnt_ns))) {
  633                         if (have_acl)
  634                                 zfs_acl_ids_free(&acl_ids);
  635                         goto out;
  636                 }
  637 
  638                 /*
  639                  * We only support the creation of regular files in
  640                  * extended attribute directories.
  641                  */
  642 
  643                 if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
  644                         if (have_acl)
  645                                 zfs_acl_ids_free(&acl_ids);
  646                         error = SET_ERROR(EINVAL);
  647                         goto out;
  648                 }
  649 
                      /* Build the ACL ids once; a retry pass reuses them. */
  650                 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
  651                     cr, vsecp, &acl_ids, mnt_ns)) != 0)
  652                         goto out;
  653                 have_acl = B_TRUE;
  654 
  655                 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
  656                         projid = zfs_inherit_projid(dzp);
  657                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
  658                         zfs_acl_ids_free(&acl_ids);
  659                         error = SET_ERROR(EDQUOT);
  660                         goto out;
  661                 }
  662 
  663                 tx = dmu_tx_create(os);
  664 
  665                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
  666                     ZFS_SA_BASE_ATTR_SIZE);
  667 
  668                 fuid_dirtied = zfsvfs->z_fuid_dirty;
  669                 if (fuid_dirtied)
  670                         zfs_fuid_txhold(zfsvfs, tx);
  671                 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
  672                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
  673                 if (!zfsvfs->z_use_sa &&
  674                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
  675                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
  676                             0, acl_ids.z_aclp->z_acl_bytes);
  677                 }
  678 
                      /*
                       * The directory entry lock is held here, so the assign
                       * is non-blocking (rule 4 in the header comment of
                       * this file).  After one dmu_tx_wait(), later attempts
                       * add TXG_NOTHROTTLE.
                       */
  679                 error = dmu_tx_assign(tx,
  680                     (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  681                 if (error) {
  682                         zfs_dirent_unlock(dl);
  683                         if (error == ERESTART) {
  684                                 waited = B_TRUE;
  685                                 dmu_tx_wait(tx);
  686                                 dmu_tx_abort(tx);
  687                                 goto top;
  688                         }
  689                         zfs_acl_ids_free(&acl_ids);
  690                         dmu_tx_abort(tx);
  691                         zfs_exit(zfsvfs, FTAG);
  692                         return (error);
  693                 }
  694                 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
  695 
  696                 error = zfs_link_create(dl, zp, tx, ZNEW);
  697                 if (error != 0) {
  698                         /*
  699                          * Since we failed to add the directory entry for it,
  700                          * delete the newly created dnode.
  701                          */
  702                         zfs_znode_delete(zp, tx);
  703                         remove_inode_hash(ZTOI(zp));
  704                         zfs_acl_ids_free(&acl_ids);
  705                         dmu_tx_commit(tx);
  706                         goto out;
  707                 }
  708 
  709                 if (fuid_dirtied)
  710                         zfs_fuid_sync(zfsvfs, tx);
  711 
                      /* Log the create before the tx commits. */
  712                 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
  713                 if (flag & FIGNORECASE)
  714                         txtype |= TX_CI;
  715                 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
  716                     vsecp, acl_ids.z_fuidp, vap);
  717                 zfs_acl_ids_free(&acl_ids);
  718                 dmu_tx_commit(tx);
  719         } else {
  720                 int aflags = (flag & O_APPEND) ? V_APPEND : 0;
  721 
                      /* ACL ids from an earlier pass are not needed here. */
  722                 if (have_acl)
  723                         zfs_acl_ids_free(&acl_ids);
  724 
  725                 /*
  726                  * A directory entry already exists for this name.
  727                  */
  728                 /*
  729                  * Can't truncate an existing file if in exclusive mode.
  730                  */
  731                 if (excl) {
  732                         error = SET_ERROR(EEXIST);
  733                         goto out;
  734                 }
  735                 /*
  736                  * Can't open a directory for writing.
  737                  */
  738                 if (S_ISDIR(ZTOI(zp)->i_mode)) {
  739                         error = SET_ERROR(EISDIR);
  740                         goto out;
  741                 }
  742                 /*
  743                  * Verify requested access to file.
  744                  */
  745                 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
  746                     mnt_ns))) {
  747                         goto out;
  748                 }
  749 
                      /* Bump the directory's z_seq under its z_lock. */
  750                 mutex_enter(&dzp->z_lock);
  751                 dzp->z_seq++;
  752                 mutex_exit(&dzp->z_lock);
  753 
  754                 /*
  755                  * Truncate regular files if requested.
  756                  */
  757                 if (S_ISREG(ZTOI(zp)->i_mode) &&
  758                     (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
  759                         /* we can't hold any locks when calling zfs_freesp() */
  760                         if (dl) {
  761                                 zfs_dirent_unlock(dl);
  762                                 dl = NULL;
  763                         }
  764                         error = zfs_freesp(zp, 0, 0, mode, TRUE);
  765                 }
  766         }
      /*
       * Common exit: drop the directory entry lock if it is still held, then
       * either release zp (on error) or hand it to the caller through zpp.
       */
  767 out:
  768 
  769         if (dl)
  770                 zfs_dirent_unlock(dl);
  771 
  772         if (error) {
  773                 if (zp)
  774                         zrele(zp);
  775         } else {
  776                 zfs_znode_update_vfs(dzp);
  777                 zfs_znode_update_vfs(zp);
  778                 *zpp = zp;
  779         }
  780 
              /* With sync=always, commit the intent log before returning. */
  781         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
  782                 zil_commit(zilog, 0);
  783 
  784         zfs_exit(zfsvfs, FTAG);
  785         return (error);
  786 }
  787 
  788 int
  789 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
  790     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
  791     zuserns_t *mnt_ns)
  792 {
  793         (void) excl, (void) mode, (void) flag;
  794         znode_t         *zp = NULL, *dzp = ITOZ(dip);
  795         zfsvfs_t        *zfsvfs = ITOZSB(dip);
  796         objset_t        *os;
  797         dmu_tx_t        *tx;
  798         int             error;
  799         uid_t           uid;
  800         gid_t           gid;
  801         zfs_acl_ids_t   acl_ids;
  802         uint64_t        projid = ZFS_DEFAULT_PROJID;
  803         boolean_t       fuid_dirtied;
  804         boolean_t       have_acl = B_FALSE;
  805         boolean_t       waited = B_FALSE;
  806 
  807         /*
  808          * If we have an ephemeral id, ACL, or XVATTR then
  809          * make sure file system is at proper version
  810          */
  811 
  812         gid = crgetgid(cr);
  813         uid = crgetuid(cr);
  814 
  815         if (zfsvfs->z_use_fuids == B_FALSE &&
  816             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
  817                 return (SET_ERROR(EINVAL));
  818 
  819         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
  820                 return (error);
  821         os = zfsvfs->z_os;
  822 
  823         if (vap->va_mask & ATTR_XVATTR) {
  824                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
  825                     crgetuid(cr), cr, vap->va_mode)) != 0) {
  826                         zfs_exit(zfsvfs, FTAG);
  827                         return (error);
  828                 }
  829         }
  830 
  831 top:
  832         *ipp = NULL;
  833 
  834         /*
  835          * Create a new file object and update the directory
  836          * to reference it.
  837          */
  838         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
  839                 if (have_acl)
  840                         zfs_acl_ids_free(&acl_ids);
  841                 goto out;
  842         }
  843 
  844         if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
  845             cr, vsecp, &acl_ids, mnt_ns)) != 0)
  846                 goto out;
  847         have_acl = B_TRUE;
  848 
  849         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
  850                 projid = zfs_inherit_projid(dzp);
  851         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
  852                 zfs_acl_ids_free(&acl_ids);
  853                 error = SET_ERROR(EDQUOT);
  854                 goto out;
  855         }
  856 
  857         tx = dmu_tx_create(os);
  858 
  859         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
  860             ZFS_SA_BASE_ATTR_SIZE);
  861         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
  862 
  863         fuid_dirtied = zfsvfs->z_fuid_dirty;
  864         if (fuid_dirtied)
  865                 zfs_fuid_txhold(zfsvfs, tx);
  866         if (!zfsvfs->z_use_sa &&
  867             acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
  868                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
  869                     0, acl_ids.z_aclp->z_acl_bytes);
  870         }
  871         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  872         if (error) {
  873                 if (error == ERESTART) {
  874                         waited = B_TRUE;
  875                         dmu_tx_wait(tx);
  876                         dmu_tx_abort(tx);
  877                         goto top;
  878                 }
  879                 zfs_acl_ids_free(&acl_ids);
  880                 dmu_tx_abort(tx);
  881                 zfs_exit(zfsvfs, FTAG);
  882                 return (error);
  883         }
  884         zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
  885 
  886         if (fuid_dirtied)
  887                 zfs_fuid_sync(zfsvfs, tx);
  888 
  889         /* Add to unlinked set */
  890         zp->z_unlinked = B_TRUE;
  891         zfs_unlinked_add(zp, tx);
  892         zfs_acl_ids_free(&acl_ids);
  893         dmu_tx_commit(tx);
  894 out:
  895 
  896         if (error) {
  897                 if (zp)
  898                         zrele(zp);
  899         } else {
  900                 zfs_znode_update_vfs(dzp);
  901                 zfs_znode_update_vfs(zp);
  902                 *ipp = ZTOI(zp);
  903         }
  904 
  905         zfs_exit(zfsvfs, FTAG);
  906         return (error);
  907 }
  908 
  909 /*
  910  * Remove an entry from a directory.
  911  *
  912  *      IN:     dzp     - znode of directory to remove entry from.
  913  *              name    - name of entry to remove.
  914  *              cr      - credentials of caller.
  915  *              flags   - case flags.
  916  *
  917  *      RETURN: 0 if success
  918  *              error code if failure
  919  *
  920  * Timestamps:
  921  *      dzp - ctime|mtime
  922  *       ip - ctime (if nlink > 0)
  923  */
  924 
  925 static uint64_t null_xattr = 0;
  926 
  927 int
  928 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
  929 {
  930         znode_t         *zp;
  931         znode_t         *xzp;
  932         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
  933         zilog_t         *zilog;
  934         uint64_t        acl_obj, xattr_obj;
  935         uint64_t        xattr_obj_unlinked = 0;
  936         uint64_t        obj = 0;
  937         uint64_t        links;
  938         zfs_dirlock_t   *dl;
  939         dmu_tx_t        *tx;
  940         boolean_t       may_delete_now, delete_now = FALSE;
  941         boolean_t       unlinked, toobig = FALSE;
  942         uint64_t        txtype;
  943         pathname_t      *realnmp = NULL;
  944         pathname_t      realnm;
  945         int             error;
  946         int             zflg = ZEXISTS;
  947         boolean_t       waited = B_FALSE;
  948 
  949         if (name == NULL)
  950                 return (SET_ERROR(EINVAL));
  951 
  952         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
  953                 return (error);
  954         zilog = zfsvfs->z_log;
  955 
  956         if (flags & FIGNORECASE) {
  957                 zflg |= ZCILOOK;
  958                 pn_alloc(&realnm);
  959                 realnmp = &realnm;
  960         }
  961 
  962 top:
  963         xattr_obj = 0;
  964         xzp = NULL;
  965         /*
  966          * Attempt to lock directory; fail if entry doesn't exist.
  967          */
  968         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
  969             NULL, realnmp))) {
  970                 if (realnmp)
  971                         pn_free(realnmp);
  972                 zfs_exit(zfsvfs, FTAG);
  973                 return (error);
  974         }
  975 
  976         if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) {
  977                 goto out;
  978         }
  979 
  980         /*
  981          * Need to use rmdir for removing directories.
  982          */
  983         if (S_ISDIR(ZTOI(zp)->i_mode)) {
  984                 error = SET_ERROR(EPERM);
  985                 goto out;
  986         }
  987 
  988         mutex_enter(&zp->z_lock);
  989         may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
  990             !(zp->z_is_mapped);
  991         mutex_exit(&zp->z_lock);
  992 
  993         /*
  994          * We may delete the znode now, or we may put it in the unlinked set;
  995          * it depends on whether we're the last link, and on whether there are
  996          * other holds on the inode.  So we dmu_tx_hold() the right things to
  997          * allow for either case.
  998          */
  999         obj = zp->z_id;
 1000         tx = dmu_tx_create(zfsvfs->z_os);
 1001         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 1002         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 1003         zfs_sa_upgrade_txholds(tx, zp);
 1004         zfs_sa_upgrade_txholds(tx, dzp);
 1005         if (may_delete_now) {
 1006                 toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
 1007                 /* if the file is too big, only hold_free a token amount */
 1008                 dmu_tx_hold_free(tx, zp->z_id, 0,
 1009                     (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
 1010         }
 1011 
 1012         /* are there any extended attributes? */
 1013         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 1014             &xattr_obj, sizeof (xattr_obj));
 1015         if (error == 0 && xattr_obj) {
 1016                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 1017                 ASSERT0(error);
 1018                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 1019                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 1020         }
 1021 
 1022         mutex_enter(&zp->z_lock);
 1023         if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
 1024                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 1025         mutex_exit(&zp->z_lock);
 1026 
 1027         /* charge as an update -- would be nice not to charge at all */
 1028         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 1029 
 1030         /*
 1031          * Mark this transaction as typically resulting in a net free of space
 1032          */
 1033         dmu_tx_mark_netfree(tx);
 1034 
 1035         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 1036         if (error) {
 1037                 zfs_dirent_unlock(dl);
 1038                 if (error == ERESTART) {
 1039                         waited = B_TRUE;
 1040                         dmu_tx_wait(tx);
 1041                         dmu_tx_abort(tx);
 1042                         zrele(zp);
 1043                         if (xzp)
 1044                                 zrele(xzp);
 1045                         goto top;
 1046                 }
 1047                 if (realnmp)
 1048                         pn_free(realnmp);
 1049                 dmu_tx_abort(tx);
 1050                 zrele(zp);
 1051                 if (xzp)
 1052                         zrele(xzp);
 1053                 zfs_exit(zfsvfs, FTAG);
 1054                 return (error);
 1055         }
 1056 
 1057         /*
 1058          * Remove the directory entry.
 1059          */
 1060         error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
 1061 
 1062         if (error) {
 1063                 dmu_tx_commit(tx);
 1064                 goto out;
 1065         }
 1066 
 1067         if (unlinked) {
 1068                 /*
 1069                  * Hold z_lock so that we can make sure that the ACL obj
 1070                  * hasn't changed.  Could have been deleted due to
 1071                  * zfs_sa_upgrade().
 1072                  */
 1073                 mutex_enter(&zp->z_lock);
 1074                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 1075                     &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
 1076                 delete_now = may_delete_now && !toobig &&
 1077                     atomic_read(&ZTOI(zp)->i_count) == 1 &&
 1078                     !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
 1079                     zfs_external_acl(zp) == acl_obj;
 1080         }
 1081 
 1082         if (delete_now) {
 1083                 if (xattr_obj_unlinked) {
 1084                         ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
 1085                         mutex_enter(&xzp->z_lock);
 1086                         xzp->z_unlinked = B_TRUE;
 1087                         clear_nlink(ZTOI(xzp));
 1088                         links = 0;
 1089                         error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 1090                             &links, sizeof (links), tx);
 1091                         ASSERT3U(error,  ==,  0);
 1092                         mutex_exit(&xzp->z_lock);
 1093                         zfs_unlinked_add(xzp, tx);
 1094 
 1095                         if (zp->z_is_sa)
 1096                                 error = sa_remove(zp->z_sa_hdl,
 1097                                     SA_ZPL_XATTR(zfsvfs), tx);
 1098                         else
 1099                                 error = sa_update(zp->z_sa_hdl,
 1100                                     SA_ZPL_XATTR(zfsvfs), &null_xattr,
 1101                                     sizeof (uint64_t), tx);
 1102                         ASSERT0(error);
 1103                 }
 1104                 /*
 1105                  * Add to the unlinked set because a new reference could be
 1106                  * taken concurrently resulting in a deferred destruction.
 1107                  */
 1108                 zfs_unlinked_add(zp, tx);
 1109                 mutex_exit(&zp->z_lock);
 1110         } else if (unlinked) {
 1111                 mutex_exit(&zp->z_lock);
 1112                 zfs_unlinked_add(zp, tx);
 1113         }
 1114 
 1115         txtype = TX_REMOVE;
 1116         if (flags & FIGNORECASE)
 1117                 txtype |= TX_CI;
 1118         zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
 1119 
 1120         dmu_tx_commit(tx);
 1121 out:
 1122         if (realnmp)
 1123                 pn_free(realnmp);
 1124 
 1125         zfs_dirent_unlock(dl);
 1126         zfs_znode_update_vfs(dzp);
 1127         zfs_znode_update_vfs(zp);
 1128 
 1129         if (delete_now)
 1130                 zrele(zp);
 1131         else
 1132                 zfs_zrele_async(zp);
 1133 
 1134         if (xzp) {
 1135                 zfs_znode_update_vfs(xzp);
 1136                 zfs_zrele_async(xzp);
 1137         }
 1138 
 1139         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 1140                 zil_commit(zilog, 0);
 1141 
 1142         zfs_exit(zfsvfs, FTAG);
 1143         return (error);
 1144 }
 1145 
 1146 /*
 1147  * Create a new directory and insert it into dzp using the name
 1148  * provided.  Return a pointer to the inserted directory.
 1149  *
 1150  *      IN:     dzp     - znode of directory to add subdir to.
 1151  *              dirname - name of new directory.
 1152  *              vap     - attributes of new directory.
 1153  *              cr      - credentials of caller.
 1154  *              flags   - case flags.
 1155  *              vsecp   - ACL to be set
 1156  *              mnt_ns  - user namespace of the mount
 1157  *
 1158  *      OUT:    zpp     - znode of created directory.
 1159  *
 1160  *      RETURN: 0 if success
 1161  *              error code if failure
 1162  *
 1163  * Timestamps:
 1164  *      dzp - ctime|mtime updated
 1165  *      zpp - ctime|mtime|atime updated
 1166  */
 1167 int
 1168 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
 1169     cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns)
 1170 {
 1171         znode_t         *zp;
 1172         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 1173         zilog_t         *zilog;
 1174         zfs_dirlock_t   *dl;
 1175         uint64_t        txtype;
 1176         dmu_tx_t        *tx;
 1177         int             error;
 1178         int             zf = ZNEW;
 1179         uid_t           uid;
 1180         gid_t           gid = crgetgid(cr);
 1181         zfs_acl_ids_t   acl_ids;
 1182         boolean_t       fuid_dirtied;
 1183         boolean_t       waited = B_FALSE;
 1184 
 1185         ASSERT(S_ISDIR(vap->va_mode));
 1186 
 1187         /*
 1188          * If we have an ephemeral id, ACL, or XVATTR then
 1189          * make sure file system is at proper version
 1190          */
 1191 
 1192         uid = crgetuid(cr);
 1193         if (zfsvfs->z_use_fuids == B_FALSE &&
 1194             (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 1195                 return (SET_ERROR(EINVAL));
 1196 
 1197         if (dirname == NULL)
 1198                 return (SET_ERROR(EINVAL));
 1199 
 1200         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 1201                 return (error);
 1202         zilog = zfsvfs->z_log;
 1203 
 1204         if (dzp->z_pflags & ZFS_XATTR) {
 1205                 zfs_exit(zfsvfs, FTAG);
 1206                 return (SET_ERROR(EINVAL));
 1207         }
 1208 
 1209         if (zfsvfs->z_utf8 && u8_validate(dirname,
 1210             strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 1211                 zfs_exit(zfsvfs, FTAG);
 1212                 return (SET_ERROR(EILSEQ));
 1213         }
 1214         if (flags & FIGNORECASE)
 1215                 zf |= ZCILOOK;
 1216 
 1217         if (vap->va_mask & ATTR_XVATTR) {
 1218                 if ((error = secpolicy_xvattr((xvattr_t *)vap,
 1219                     crgetuid(cr), cr, vap->va_mode)) != 0) {
 1220                         zfs_exit(zfsvfs, FTAG);
 1221                         return (error);
 1222                 }
 1223         }
 1224 
 1225         if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 1226             vsecp, &acl_ids, mnt_ns)) != 0) {
 1227                 zfs_exit(zfsvfs, FTAG);
 1228                 return (error);
 1229         }
 1230         /*
 1231          * First make sure the new directory doesn't exist.
 1232          *
 1233          * Existence is checked first to make sure we don't return
 1234          * EACCES instead of EEXIST which can cause some applications
 1235          * to fail.
 1236          */
 1237 top:
 1238         *zpp = NULL;
 1239 
 1240         if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
 1241             NULL, NULL))) {
 1242                 zfs_acl_ids_free(&acl_ids);
 1243                 zfs_exit(zfsvfs, FTAG);
 1244                 return (error);
 1245         }
 1246 
 1247         if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
 1248             mnt_ns))) {
 1249                 zfs_acl_ids_free(&acl_ids);
 1250                 zfs_dirent_unlock(dl);
 1251                 zfs_exit(zfsvfs, FTAG);
 1252                 return (error);
 1253         }
 1254 
 1255         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
 1256                 zfs_acl_ids_free(&acl_ids);
 1257                 zfs_dirent_unlock(dl);
 1258                 zfs_exit(zfsvfs, FTAG);
 1259                 return (SET_ERROR(EDQUOT));
 1260         }
 1261 
 1262         /*
 1263          * Add a new entry to the directory.
 1264          */
 1265         tx = dmu_tx_create(zfsvfs->z_os);
 1266         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 1267         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 1268         fuid_dirtied = zfsvfs->z_fuid_dirty;
 1269         if (fuid_dirtied)
 1270                 zfs_fuid_txhold(zfsvfs, tx);
 1271         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 1272                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 1273                     acl_ids.z_aclp->z_acl_bytes);
 1274         }
 1275 
 1276         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 1277             ZFS_SA_BASE_ATTR_SIZE);
 1278 
 1279         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 1280         if (error) {
 1281                 zfs_dirent_unlock(dl);
 1282                 if (error == ERESTART) {
 1283                         waited = B_TRUE;
 1284                         dmu_tx_wait(tx);
 1285                         dmu_tx_abort(tx);
 1286                         goto top;
 1287                 }
 1288                 zfs_acl_ids_free(&acl_ids);
 1289                 dmu_tx_abort(tx);
 1290                 zfs_exit(zfsvfs, FTAG);
 1291                 return (error);
 1292         }
 1293 
 1294         /*
 1295          * Create new node.
 1296          */
 1297         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 1298 
 1299         /*
 1300          * Now put new name in parent dir.
 1301          */
 1302         error = zfs_link_create(dl, zp, tx, ZNEW);
 1303         if (error != 0) {
 1304                 zfs_znode_delete(zp, tx);
 1305                 remove_inode_hash(ZTOI(zp));
 1306                 goto out;
 1307         }
 1308 
 1309         if (fuid_dirtied)
 1310                 zfs_fuid_sync(zfsvfs, tx);
 1311 
 1312         *zpp = zp;
 1313 
 1314         txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
 1315         if (flags & FIGNORECASE)
 1316                 txtype |= TX_CI;
 1317         zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
 1318             acl_ids.z_fuidp, vap);
 1319 
 1320 out:
 1321         zfs_acl_ids_free(&acl_ids);
 1322 
 1323         dmu_tx_commit(tx);
 1324 
 1325         zfs_dirent_unlock(dl);
 1326 
 1327         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 1328                 zil_commit(zilog, 0);
 1329 
 1330         if (error != 0) {
 1331                 zrele(zp);
 1332         } else {
 1333                 zfs_znode_update_vfs(dzp);
 1334                 zfs_znode_update_vfs(zp);
 1335         }
 1336         zfs_exit(zfsvfs, FTAG);
 1337         return (error);
 1338 }
 1339 
 1340 /*
 1341  * Remove a directory subdir entry.  If the current working
 1342  * directory is the same as the subdir to be removed, the
 1343  * remove will fail.
 1344  *
 1345  *      IN:     dzp     - znode of directory to remove from.
 1346  *              name    - name of directory to be removed.
 1347  *              cwd     - inode of current working directory.
 1348  *              cr      - credentials of caller.
 1349  *              flags   - case flags
 1350  *
 1351  *      RETURN: 0 on success, error code on failure.
 1352  *
 1353  * Timestamps:
 1354  *      dzp - ctime|mtime updated
 1355  */
 1356 int
 1357 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
 1358     int flags)
 1359 {
 1360         znode_t         *zp;
 1361         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 1362         zilog_t         *zilog;
 1363         zfs_dirlock_t   *dl;
 1364         dmu_tx_t        *tx;
 1365         int             error;
 1366         int             zflg = ZEXISTS;
 1367         boolean_t       waited = B_FALSE;
 1368 
 1369         if (name == NULL)
 1370                 return (SET_ERROR(EINVAL));
 1371 
 1372         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 1373                 return (error);
 1374         zilog = zfsvfs->z_log;
 1375 
 1376         if (flags & FIGNORECASE)
 1377                 zflg |= ZCILOOK;
 1378 top:
 1379         zp = NULL;
 1380 
 1381         /*
 1382          * Attempt to lock directory; fail if entry doesn't exist.
 1383          */
 1384         if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 1385             NULL, NULL))) {
 1386                 zfs_exit(zfsvfs, FTAG);
 1387                 return (error);
 1388         }
 1389 
 1390         if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) {
 1391                 goto out;
 1392         }
 1393 
 1394         if (!S_ISDIR(ZTOI(zp)->i_mode)) {
 1395                 error = SET_ERROR(ENOTDIR);
 1396                 goto out;
 1397         }
 1398 
 1399         if (zp == cwd) {
 1400                 error = SET_ERROR(EINVAL);
 1401                 goto out;
 1402         }
 1403 
 1404         /*
 1405          * Grab a lock on the directory to make sure that no one is
 1406          * trying to add (or lookup) entries while we are removing it.
 1407          */
 1408         rw_enter(&zp->z_name_lock, RW_WRITER);
 1409 
 1410         /*
 1411          * Grab a lock on the parent pointer to make sure we play well
 1412          * with the treewalk and directory rename code.
 1413          */
 1414         rw_enter(&zp->z_parent_lock, RW_WRITER);
 1415 
 1416         tx = dmu_tx_create(zfsvfs->z_os);
 1417         dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 1418         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 1419         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 1420         zfs_sa_upgrade_txholds(tx, zp);
 1421         zfs_sa_upgrade_txholds(tx, dzp);
 1422         dmu_tx_mark_netfree(tx);
 1423         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 1424         if (error) {
 1425                 rw_exit(&zp->z_parent_lock);
 1426                 rw_exit(&zp->z_name_lock);
 1427                 zfs_dirent_unlock(dl);
 1428                 if (error == ERESTART) {
 1429                         waited = B_TRUE;
 1430                         dmu_tx_wait(tx);
 1431                         dmu_tx_abort(tx);
 1432                         zrele(zp);
 1433                         goto top;
 1434                 }
 1435                 dmu_tx_abort(tx);
 1436                 zrele(zp);
 1437                 zfs_exit(zfsvfs, FTAG);
 1438                 return (error);
 1439         }
 1440 
 1441         error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
 1442 
 1443         if (error == 0) {
 1444                 uint64_t txtype = TX_RMDIR;
 1445                 if (flags & FIGNORECASE)
 1446                         txtype |= TX_CI;
 1447                 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
 1448                     B_FALSE);
 1449         }
 1450 
 1451         dmu_tx_commit(tx);
 1452 
 1453         rw_exit(&zp->z_parent_lock);
 1454         rw_exit(&zp->z_name_lock);
 1455 out:
 1456         zfs_dirent_unlock(dl);
 1457 
 1458         zfs_znode_update_vfs(dzp);
 1459         zfs_znode_update_vfs(zp);
 1460         zrele(zp);
 1461 
 1462         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 1463                 zil_commit(zilog, 0);
 1464 
 1465         zfs_exit(zfsvfs, FTAG);
 1466         return (error);
 1467 }
 1468 
 1469 /*
 1470  * Read directory entries from the given directory cursor position and emit
 1471  * name and position for each entry.
 1472  *
 1473  *      IN:     ip      - inode of directory to read.
 1474  *              ctx     - directory entry context.
 1475  *              cr      - credentials of caller.
 1476  *
 1477  *      RETURN: 0 if success
 1478  *              error code if failure
 1479  *
 1480  * Timestamps:
 1481  *      ip - atime updated
 1482  *
 1483  * Note that the low 4 bits of the cookie returned by zap is always zero.
 1484  * This allows us to use the low range for "special" directory entries:
 1485  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 1486  * we use the offset 2 for the '.zfs' directory.
 1487  */
 1488 int
 1489 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
 1490 {
 1491         (void) cr;
 1492         znode_t         *zp = ITOZ(ip);
 1493         zfsvfs_t        *zfsvfs = ITOZSB(ip);
 1494         objset_t        *os;
 1495         zap_cursor_t    zc;
 1496         zap_attribute_t zap;
 1497         int             error;
 1498         uint8_t         prefetch;
 1499         uint8_t         type;
 1500         int             done = 0;
 1501         uint64_t        parent;
 1502         uint64_t        offset; /* must be unsigned; checks for < 1 */
 1503 
 1504         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 1505                 return (error);
 1506 
 1507         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 1508             &parent, sizeof (parent))) != 0)
 1509                 goto out;
 1510 
 1511         /*
 1512          * Quit if directory has been removed (posix)
 1513          */
 1514         if (zp->z_unlinked)
 1515                 goto out;
 1516 
 1517         error = 0;
 1518         os = zfsvfs->z_os;
 1519         offset = ctx->pos;
 1520         prefetch = zp->z_zn_prefetch;
 1521 
 1522         /*
 1523          * Initialize the iterator cursor.
 1524          */
 1525         if (offset <= 3) {
 1526                 /*
 1527                  * Start iteration from the beginning of the directory.
 1528                  */
 1529                 zap_cursor_init(&zc, os, zp->z_id);
 1530         } else {
 1531                 /*
 1532                  * The offset is a serialized cursor.
 1533                  */
 1534                 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 1535         }
 1536 
 1537         /*
 1538          * Transform to file-system independent format
 1539          */
 1540         while (!done) {
 1541                 uint64_t objnum;
 1542                 /*
 1543                  * Special case `.', `..', and `.zfs'.
 1544                  */
 1545                 if (offset == 0) {
 1546                         (void) strcpy(zap.za_name, ".");
 1547                         zap.za_normalization_conflict = 0;
 1548                         objnum = zp->z_id;
 1549                         type = DT_DIR;
 1550                 } else if (offset == 1) {
 1551                         (void) strcpy(zap.za_name, "..");
 1552                         zap.za_normalization_conflict = 0;
 1553                         objnum = parent;
 1554                         type = DT_DIR;
 1555                 } else if (offset == 2 && zfs_show_ctldir(zp)) {
 1556                         (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 1557                         zap.za_normalization_conflict = 0;
 1558                         objnum = ZFSCTL_INO_ROOT;
 1559                         type = DT_DIR;
 1560                 } else {
 1561                         /*
 1562                          * Grab next entry.
 1563                          */
 1564                         if ((error = zap_cursor_retrieve(&zc, &zap))) {
 1565                                 if (error == ENOENT)
 1566                                         break;
 1567                                 else
 1568                                         goto update;
 1569                         }
 1570 
 1571                         /*
 1572                          * Allow multiple entries provided the first entry is
 1573                          * the object id.  Non-zpl consumers may safely make
 1574                          * use of the additional space.
 1575                          *
 1576                          * XXX: This should be a feature flag for compatibility
 1577                          */
 1578                         if (zap.za_integer_length != 8 ||
 1579                             zap.za_num_integers == 0) {
 1580                                 cmn_err(CE_WARN, "zap_readdir: bad directory "
 1581                                     "entry, obj = %lld, offset = %lld, "
 1582                                     "length = %d, num = %lld\n",
 1583                                     (u_longlong_t)zp->z_id,
 1584                                     (u_longlong_t)offset,
 1585                                     zap.za_integer_length,
 1586                                     (u_longlong_t)zap.za_num_integers);
 1587                                 error = SET_ERROR(ENXIO);
 1588                                 goto update;
 1589                         }
 1590 
 1591                         objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 1592                         type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 1593                 }
 1594 
 1595                 done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
 1596                     objnum, type);
 1597                 if (done)
 1598                         break;
 1599 
 1600                 /* Prefetch znode */
 1601                 if (prefetch) {
 1602                         dmu_prefetch(os, objnum, 0, 0, 0,
 1603                             ZIO_PRIORITY_SYNC_READ);
 1604                 }
 1605 
 1606                 /*
 1607                  * Move to the next entry, fill in the previous offset.
 1608                  */
 1609                 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 1610                         zap_cursor_advance(&zc);
 1611                         offset = zap_cursor_serialize(&zc);
 1612                 } else {
 1613                         offset += 1;
 1614                 }
 1615                 ctx->pos = offset;
 1616         }
 1617         zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 1618 
 1619 update:
 1620         zap_cursor_fini(&zc);
 1621         if (error == ENOENT)
 1622                 error = 0;
 1623 out:
 1624         zfs_exit(zfsvfs, FTAG);
 1625 
 1626         return (error);
 1627 }
 1628 
 1629 /*
 1630  * Get the basic file attributes and place them in the provided kstat
 1631  * structure.  The inode is assumed to be the authoritative source
 1632  * for most of the attributes.  However, the znode currently has the
 1633  * authoritative atime, blksize, and block count.
 1634  *
 1635  *      IN:     ip      - inode of file.
 1636  *
 1637  *      OUT:    sp      - kstat values.
 1638  *
 1639  *      RETURN: 0 (always succeeds)
 1640  */
 1641 int
 1642 zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip,
 1643     struct kstat *sp)
 1644 {
 1645         znode_t *zp = ITOZ(ip);
 1646         zfsvfs_t *zfsvfs = ITOZSB(ip);
 1647         uint32_t blksize;
 1648         u_longlong_t nblocks;
 1649         int error;
 1650 
 1651         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 1652                 return (error);
 1653 
 1654         mutex_enter(&zp->z_lock);
 1655 
 1656         zpl_generic_fillattr(user_ns, ip, sp);
 1657         /*
 1658          * +1 link count for root inode with visible '.zfs' directory.
 1659          */
 1660         if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
 1661                 if (sp->nlink < ZFS_LINK_MAX)
 1662                         sp->nlink++;
 1663 
 1664         sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 1665         sp->blksize = blksize;
 1666         sp->blocks = nblocks;
 1667 
 1668         if (unlikely(zp->z_blksz == 0)) {
 1669                 /*
 1670                  * Block size hasn't been set; suggest maximal I/O transfers.
 1671                  */
 1672                 sp->blksize = zfsvfs->z_max_blksz;
 1673         }
 1674 
 1675         mutex_exit(&zp->z_lock);
 1676 
 1677         /*
 1678          * Required to prevent NFS client from detecting different inode
 1679          * numbers of snapshot root dentry before and after snapshot mount.
 1680          */
 1681         if (zfsvfs->z_issnap) {
 1682                 if (ip->i_sb->s_root->d_inode == ip)
 1683                         sp->ino = ZFSCTL_INO_SNAPDIRS -
 1684                             dmu_objset_id(zfsvfs->z_os);
 1685         }
 1686 
 1687         zfs_exit(zfsvfs, FTAG);
 1688 
 1689         return (0);
 1690 }
 1691 
 1692 /*
 1693  * For the operation of changing file's user/group/project, we need to
 1694  * handle not only the main object that is assigned to the file directly,
 1695  * but also the ones that are used by the file via hidden xattr directory.
 1696  *
 1697  * Because the xattr directory may contains many EA entries, as to it may
 1698  * be impossible to change all of them via the transaction of changing the
 1699  * main object's user/group/project attributes. Then we have to change them
 1700  * via other multiple independent transactions one by one. It may be not good
 1701  * solution, but we have no better idea yet.
 1702  */
 1703 static int
 1704 zfs_setattr_dir(znode_t *dzp)
 1705 {
 1706         struct inode    *dxip = ZTOI(dzp);
 1707         struct inode    *xip = NULL;
 1708         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 1709         objset_t        *os = zfsvfs->z_os;
 1710         zap_cursor_t    zc;
 1711         zap_attribute_t zap;
 1712         zfs_dirlock_t   *dl;
 1713         znode_t         *zp = NULL;
 1714         dmu_tx_t        *tx = NULL;
 1715         uint64_t        uid, gid;
 1716         sa_bulk_attr_t  bulk[4];
 1717         int             count;
 1718         int             err;
 1719 
 1720         zap_cursor_init(&zc, os, dzp->z_id);
 1721         while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
 1722                 count = 0;
 1723                 if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
 1724                         err = ENXIO;
 1725                         break;
 1726                 }
 1727 
 1728                 err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
 1729                     ZEXISTS, NULL, NULL);
 1730                 if (err == ENOENT)
 1731                         goto next;
 1732                 if (err)
 1733                         break;
 1734 
 1735                 xip = ZTOI(zp);
 1736                 if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
 1737                     KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
 1738                     zp->z_projid == dzp->z_projid)
 1739                         goto next;
 1740 
 1741                 tx = dmu_tx_create(os);
 1742                 if (!(zp->z_pflags & ZFS_PROJID))
 1743                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 1744                 else
 1745                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 1746 
 1747                 err = dmu_tx_assign(tx, TXG_WAIT);
 1748                 if (err)
 1749                         break;
 1750 
 1751                 mutex_enter(&dzp->z_lock);
 1752 
 1753                 if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
 1754                         xip->i_uid = dxip->i_uid;
 1755                         uid = zfs_uid_read(dxip);
 1756                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 1757                             &uid, sizeof (uid));
 1758                 }
 1759 
 1760                 if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
 1761                         xip->i_gid = dxip->i_gid;
 1762                         gid = zfs_gid_read(dxip);
 1763                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 1764                             &gid, sizeof (gid));
 1765                 }
 1766 
 1767                 if (zp->z_projid != dzp->z_projid) {
 1768                         if (!(zp->z_pflags & ZFS_PROJID)) {
 1769                                 zp->z_pflags |= ZFS_PROJID;
 1770                                 SA_ADD_BULK_ATTR(bulk, count,
 1771                                     SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
 1772                                     sizeof (zp->z_pflags));
 1773                         }
 1774 
 1775                         zp->z_projid = dzp->z_projid;
 1776                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
 1777                             NULL, &zp->z_projid, sizeof (zp->z_projid));
 1778                 }
 1779 
 1780                 mutex_exit(&dzp->z_lock);
 1781 
 1782                 if (likely(count > 0)) {
 1783                         err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 1784                         dmu_tx_commit(tx);
 1785                 } else {
 1786                         dmu_tx_abort(tx);
 1787                 }
 1788                 tx = NULL;
 1789                 if (err != 0 && err != ENOENT)
 1790                         break;
 1791 
 1792 next:
 1793                 if (zp) {
 1794                         zrele(zp);
 1795                         zp = NULL;
 1796                         zfs_dirent_unlock(dl);
 1797                 }
 1798                 zap_cursor_advance(&zc);
 1799         }
 1800 
 1801         if (tx)
 1802                 dmu_tx_abort(tx);
 1803         if (zp) {
 1804                 zrele(zp);
 1805                 zfs_dirent_unlock(dl);
 1806         }
 1807         zap_cursor_fini(&zc);
 1808 
 1809         return (err == ENOENT ? 0 : err);
 1810 }
 1811 
 1812 /*
 1813  * Set the file attributes to the values contained in the
 1814  * vattr structure.
 1815  *
 1816  *      IN:     zp      - znode of file to be modified.
 1817  *              vap     - new attribute values.
 1818  *                        If ATTR_XVATTR set, then optional attrs are being set
 1819  *              flags   - ATTR_UTIME set if non-default time values provided.
 1820  *                      - ATTR_NOACLCHECK (CIFS context only).
 1821  *              cr      - credentials of caller.
 1822  *              mnt_ns  - user namespace of the mount
 1823  *
 1824  *      RETURN: 0 if success
 1825  *              error code if failure
 1826  *
 1827  * Timestamps:
 1828  *      ip - ctime updated, mtime updated if size changed.
 1829  */
 1830 int
 1831 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns)
 1832 {
 1833         struct inode    *ip;
 1834         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
 1835         objset_t        *os = zfsvfs->z_os;
 1836         zilog_t         *zilog;
 1837         dmu_tx_t        *tx;
 1838         vattr_t         oldva;
 1839         xvattr_t        *tmpxvattr;
 1840         uint_t          mask = vap->va_mask;
 1841         uint_t          saved_mask = 0;
 1842         int             trim_mask = 0;
 1843         uint64_t        new_mode;
 1844         uint64_t        new_kuid = 0, new_kgid = 0, new_uid, new_gid;
 1845         uint64_t        xattr_obj;
 1846         uint64_t        mtime[2], ctime[2], atime[2];
 1847         uint64_t        projid = ZFS_INVALID_PROJID;
 1848         znode_t         *attrzp;
 1849         int             need_policy = FALSE;
 1850         int             err, err2 = 0;
 1851         zfs_fuid_info_t *fuidp = NULL;
 1852         xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
 1853         xoptattr_t      *xoap;
 1854         zfs_acl_t       *aclp;
 1855         boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 1856         boolean_t       fuid_dirtied = B_FALSE;
 1857         boolean_t       handle_eadir = B_FALSE;
 1858         sa_bulk_attr_t  *bulk, *xattr_bulk;
 1859         int             count = 0, xattr_count = 0, bulks = 8;
 1860 
 1861         if (mask == 0)
 1862                 return (0);
 1863 
 1864         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 1865                 return (err);
 1866         ip = ZTOI(zp);
 1867 
 1868         /*
 1869          * If this is a xvattr_t, then get a pointer to the structure of
 1870          * optional attributes.  If this is NULL, then we have a vattr_t.
 1871          */
 1872         xoap = xva_getxoptattr(xvap);
 1873         if (xoap != NULL && (mask & ATTR_XVATTR)) {
 1874                 if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
 1875                         if (!dmu_objset_projectquota_enabled(os) ||
 1876                             (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
 1877                                 zfs_exit(zfsvfs, FTAG);
 1878                                 return (SET_ERROR(ENOTSUP));
 1879                         }
 1880 
 1881                         projid = xoap->xoa_projid;
 1882                         if (unlikely(projid == ZFS_INVALID_PROJID)) {
 1883                                 zfs_exit(zfsvfs, FTAG);
 1884                                 return (SET_ERROR(EINVAL));
 1885                         }
 1886 
 1887                         if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
 1888                                 projid = ZFS_INVALID_PROJID;
 1889                         else
 1890                                 need_policy = TRUE;
 1891                 }
 1892 
 1893                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
 1894                     (xoap->xoa_projinherit !=
 1895                     ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
 1896                     (!dmu_objset_projectquota_enabled(os) ||
 1897                     (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
 1898                         zfs_exit(zfsvfs, FTAG);
 1899                         return (SET_ERROR(ENOTSUP));
 1900                 }
 1901         }
 1902 
 1903         zilog = zfsvfs->z_log;
 1904 
 1905         /*
 1906          * Make sure that if we have ephemeral uid/gid or xvattr specified
 1907          * that file system is at proper version level
 1908          */
 1909 
 1910         if (zfsvfs->z_use_fuids == B_FALSE &&
 1911             (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 1912             ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 1913             (mask & ATTR_XVATTR))) {
 1914                 zfs_exit(zfsvfs, FTAG);
 1915                 return (SET_ERROR(EINVAL));
 1916         }
 1917 
 1918         if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
 1919                 zfs_exit(zfsvfs, FTAG);
 1920                 return (SET_ERROR(EISDIR));
 1921         }
 1922 
 1923         if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
 1924                 zfs_exit(zfsvfs, FTAG);
 1925                 return (SET_ERROR(EINVAL));
 1926         }
 1927 
 1928         tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
 1929         xva_init(tmpxvattr);
 1930 
 1931         bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
 1932         xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
 1933 
 1934         /*
 1935          * Immutable files can only alter immutable bit and atime
 1936          */
 1937         if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 1938             ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
 1939             ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 1940                 err = SET_ERROR(EPERM);
 1941                 goto out3;
 1942         }
 1943 
 1944         if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
 1945                 err = SET_ERROR(EPERM);
 1946                 goto out3;
 1947         }
 1948 
 1949         /*
 1950          * Verify timestamps doesn't overflow 32 bits.
 1951          * ZFS can handle large timestamps, but 32bit syscalls can't
 1952          * handle times greater than 2039.  This check should be removed
 1953          * once large timestamps are fully supported.
 1954          */
 1955         if (mask & (ATTR_ATIME | ATTR_MTIME)) {
 1956                 if (((mask & ATTR_ATIME) &&
 1957                     TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 1958                     ((mask & ATTR_MTIME) &&
 1959                     TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 1960                         err = SET_ERROR(EOVERFLOW);
 1961                         goto out3;
 1962                 }
 1963         }
 1964 
 1965 top:
 1966         attrzp = NULL;
 1967         aclp = NULL;
 1968 
 1969         /* Can this be moved to before the top label? */
 1970         if (zfs_is_readonly(zfsvfs)) {
 1971                 err = SET_ERROR(EROFS);
 1972                 goto out3;
 1973         }
 1974 
 1975         /*
 1976          * First validate permissions
 1977          */
 1978 
 1979         if (mask & ATTR_SIZE) {
 1980                 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
 1981                     mnt_ns);
 1982                 if (err)
 1983                         goto out3;
 1984 
 1985                 /*
 1986                  * XXX - Note, we are not providing any open
 1987                  * mode flags here (like FNDELAY), so we may
 1988                  * block if there are locks present... this
 1989                  * should be addressed in openat().
 1990                  */
 1991                 /* XXX - would it be OK to generate a log record here? */
 1992                 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 1993                 if (err)
 1994                         goto out3;
 1995         }
 1996 
 1997         if (mask & (ATTR_ATIME|ATTR_MTIME) ||
 1998             ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 1999             XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 2000             XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 2001             XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 2002             XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 2003             XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 2004             XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 2005                 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 2006                     skipaclchk, cr, mnt_ns);
 2007         }
 2008 
 2009         if (mask & (ATTR_UID|ATTR_GID)) {
 2010                 int     idmask = (mask & (ATTR_UID|ATTR_GID));
 2011                 int     take_owner;
 2012                 int     take_group;
 2013                 uid_t   uid;
 2014                 gid_t   gid;
 2015 
 2016                 /*
 2017                  * NOTE: even if a new mode is being set,
 2018                  * we may clear S_ISUID/S_ISGID bits.
 2019                  */
 2020 
 2021                 if (!(mask & ATTR_MODE))
 2022                         vap->va_mode = zp->z_mode;
 2023 
 2024                 /*
 2025                  * Take ownership or chgrp to group we are a member of
 2026                  */
 2027 
 2028                 uid = zfs_uid_to_vfsuid((struct user_namespace *)mnt_ns,
 2029                     zfs_i_user_ns(ip), vap->va_uid);
 2030                 gid = zfs_gid_to_vfsgid((struct user_namespace *)mnt_ns,
 2031                     zfs_i_user_ns(ip), vap->va_gid);
 2032                 take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
 2033                 take_group = (mask & ATTR_GID) &&
 2034                     zfs_groupmember(zfsvfs, gid, cr);
 2035 
 2036                 /*
 2037                  * If both ATTR_UID and ATTR_GID are set then take_owner and
 2038                  * take_group must both be set in order to allow taking
 2039                  * ownership.
 2040                  *
 2041                  * Otherwise, send the check through secpolicy_vnode_setattr()
 2042                  *
 2043                  */
 2044 
 2045                 if (((idmask == (ATTR_UID|ATTR_GID)) &&
 2046                     take_owner && take_group) ||
 2047                     ((idmask == ATTR_UID) && take_owner) ||
 2048                     ((idmask == ATTR_GID) && take_group)) {
 2049                         if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 2050                             skipaclchk, cr, mnt_ns) == 0) {
 2051                                 /*
 2052                                  * Remove setuid/setgid for non-privileged users
 2053                                  */
 2054                                 (void) secpolicy_setid_clear(vap, cr);
 2055                                 trim_mask = (mask & (ATTR_UID|ATTR_GID));
 2056                         } else {
 2057                                 need_policy =  TRUE;
 2058                         }
 2059                 } else {
 2060                         need_policy =  TRUE;
 2061                 }
 2062         }
 2063 
 2064         mutex_enter(&zp->z_lock);
 2065         oldva.va_mode = zp->z_mode;
 2066         zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 2067         if (mask & ATTR_XVATTR) {
 2068                 /*
 2069                  * Update xvattr mask to include only those attributes
 2070                  * that are actually changing.
 2071                  *
 2072                  * the bits will be restored prior to actually setting
 2073                  * the attributes so the caller thinks they were set.
 2074                  */
 2075                 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 2076                         if (xoap->xoa_appendonly !=
 2077                             ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 2078                                 need_policy = TRUE;
 2079                         } else {
 2080                                 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 2081                                 XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
 2082                         }
 2083                 }
 2084 
 2085                 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
 2086                         if (xoap->xoa_projinherit !=
 2087                             ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
 2088                                 need_policy = TRUE;
 2089                         } else {
 2090                                 XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
 2091                                 XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
 2092                         }
 2093                 }
 2094 
 2095                 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 2096                         if (xoap->xoa_nounlink !=
 2097                             ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 2098                                 need_policy = TRUE;
 2099                         } else {
 2100                                 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 2101                                 XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
 2102                         }
 2103                 }
 2104 
 2105                 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 2106                         if (xoap->xoa_immutable !=
 2107                             ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 2108                                 need_policy = TRUE;
 2109                         } else {
 2110                                 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 2111                                 XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
 2112                         }
 2113                 }
 2114 
 2115                 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 2116                         if (xoap->xoa_nodump !=
 2117                             ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 2118                                 need_policy = TRUE;
 2119                         } else {
 2120                                 XVA_CLR_REQ(xvap, XAT_NODUMP);
 2121                                 XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
 2122                         }
 2123                 }
 2124 
 2125                 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 2126                         if (xoap->xoa_av_modified !=
 2127                             ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 2128                                 need_policy = TRUE;
 2129                         } else {
 2130                                 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 2131                                 XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
 2132                         }
 2133                 }
 2134 
 2135                 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 2136                         if ((!S_ISREG(ip->i_mode) &&
 2137                             xoap->xoa_av_quarantined) ||
 2138                             xoap->xoa_av_quarantined !=
 2139                             ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 2140                                 need_policy = TRUE;
 2141                         } else {
 2142                                 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 2143                                 XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
 2144                         }
 2145                 }
 2146 
 2147                 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 2148                         mutex_exit(&zp->z_lock);
 2149                         err = SET_ERROR(EPERM);
 2150                         goto out3;
 2151                 }
 2152 
 2153                 if (need_policy == FALSE &&
 2154                     (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 2155                     XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 2156                         need_policy = TRUE;
 2157                 }
 2158         }
 2159 
 2160         mutex_exit(&zp->z_lock);
 2161 
 2162         if (mask & ATTR_MODE) {
 2163                 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
 2164                     mnt_ns) == 0) {
 2165                         err = secpolicy_setid_setsticky_clear(ip, vap,
 2166                             &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
 2167                         if (err)
 2168                                 goto out3;
 2169                         trim_mask |= ATTR_MODE;
 2170                 } else {
 2171                         need_policy = TRUE;
 2172                 }
 2173         }
 2174 
 2175         if (need_policy) {
 2176                 /*
 2177                  * If trim_mask is set then take ownership
 2178                  * has been granted or write_acl is present and user
 2179                  * has the ability to modify mode.  In that case remove
 2180                  * UID|GID and or MODE from mask so that
 2181                  * secpolicy_vnode_setattr() doesn't revoke it.
 2182                  */
 2183 
 2184                 if (trim_mask) {
 2185                         saved_mask = vap->va_mask;
 2186                         vap->va_mask &= ~trim_mask;
 2187                 }
 2188                 err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
 2189                     zfs_zaccess_unix, zp);
 2190                 if (err)
 2191                         goto out3;
 2192 
 2193                 if (trim_mask)
 2194                         vap->va_mask |= saved_mask;
 2195         }
 2196 
 2197         /*
 2198          * secpolicy_vnode_setattr, or take ownership may have
 2199          * changed va_mask
 2200          */
 2201         mask = vap->va_mask;
 2202 
 2203         if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
 2204                 handle_eadir = B_TRUE;
 2205                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 2206                     &xattr_obj, sizeof (xattr_obj));
 2207 
 2208                 if (err == 0 && xattr_obj) {
 2209                         err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
 2210                         if (err)
 2211                                 goto out2;
 2212                 }
 2213                 if (mask & ATTR_UID) {
 2214                         new_kuid = zfs_fuid_create(zfsvfs,
 2215                             (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 2216                         if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
 2217                             zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
 2218                             new_kuid)) {
 2219                                 if (attrzp)
 2220                                         zrele(attrzp);
 2221                                 err = SET_ERROR(EDQUOT);
 2222                                 goto out2;
 2223                         }
 2224                 }
 2225 
 2226                 if (mask & ATTR_GID) {
 2227                         new_kgid = zfs_fuid_create(zfsvfs,
 2228                             (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
 2229                         if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
 2230                             zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 2231                             new_kgid)) {
 2232                                 if (attrzp)
 2233                                         zrele(attrzp);
 2234                                 err = SET_ERROR(EDQUOT);
 2235                                 goto out2;
 2236                         }
 2237                 }
 2238 
 2239                 if (projid != ZFS_INVALID_PROJID &&
 2240                     zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
 2241                         if (attrzp)
 2242                                 zrele(attrzp);
 2243                         err = EDQUOT;
 2244                         goto out2;
 2245                 }
 2246         }
 2247         tx = dmu_tx_create(os);
 2248 
 2249         if (mask & ATTR_MODE) {
 2250                 uint64_t pmode = zp->z_mode;
 2251                 uint64_t acl_obj;
 2252                 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 2253 
 2254                 if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
 2255                     !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 2256                         err = EPERM;
 2257                         goto out;
 2258                 }
 2259 
 2260                 if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
 2261                         goto out;
 2262 
 2263                 mutex_enter(&zp->z_lock);
 2264                 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 2265                         /*
 2266                          * Are we upgrading ACL from old V0 format
 2267                          * to V1 format?
 2268                          */
 2269                         if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 2270                             zfs_znode_acl_version(zp) ==
 2271                             ZFS_ACL_VERSION_INITIAL) {
 2272                                 dmu_tx_hold_free(tx, acl_obj, 0,
 2273                                     DMU_OBJECT_END);
 2274                                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 2275                                     0, aclp->z_acl_bytes);
 2276                         } else {
 2277                                 dmu_tx_hold_write(tx, acl_obj, 0,
 2278                                     aclp->z_acl_bytes);
 2279                         }
 2280                 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 2281                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 2282                             0, aclp->z_acl_bytes);
 2283                 }
 2284                 mutex_exit(&zp->z_lock);
 2285                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 2286         } else {
 2287                 if (((mask & ATTR_XVATTR) &&
 2288                     XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
 2289                     (projid != ZFS_INVALID_PROJID &&
 2290                     !(zp->z_pflags & ZFS_PROJID)))
 2291                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 2292                 else
 2293                         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 2294         }
 2295 
 2296         if (attrzp) {
 2297                 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 2298         }
 2299 
 2300         fuid_dirtied = zfsvfs->z_fuid_dirty;
 2301         if (fuid_dirtied)
 2302                 zfs_fuid_txhold(zfsvfs, tx);
 2303 
 2304         zfs_sa_upgrade_txholds(tx, zp);
 2305 
 2306         err = dmu_tx_assign(tx, TXG_WAIT);
 2307         if (err)
 2308                 goto out;
 2309 
 2310         count = 0;
 2311         /*
 2312          * Set each attribute requested.
 2313          * We group settings according to the locks they need to acquire.
 2314          *
 2315          * Note: you cannot set ctime directly, although it will be
 2316          * updated as a side-effect of calling this function.
 2317          */
 2318 
 2319         if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
 2320                 /*
 2321                  * For the existed object that is upgraded from old system,
 2322                  * its on-disk layout has no slot for the project ID attribute.
 2323                  * But quota accounting logic needs to access related slots by
 2324                  * offset directly. So we need to adjust old objects' layout
 2325                  * to make the project ID to some unified and fixed offset.
 2326                  */
 2327                 if (attrzp)
 2328                         err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
 2329                 if (err == 0)
 2330                         err = sa_add_projid(zp->z_sa_hdl, tx, projid);
 2331 
 2332                 if (unlikely(err == EEXIST))
 2333                         err = 0;
 2334                 else if (err != 0)
 2335                         goto out;
 2336                 else
 2337                         projid = ZFS_INVALID_PROJID;
 2338         }
 2339 
 2340         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 2341                 mutex_enter(&zp->z_acl_lock);
 2342         mutex_enter(&zp->z_lock);
 2343 
 2344         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 2345             &zp->z_pflags, sizeof (zp->z_pflags));
 2346 
 2347         if (attrzp) {
 2348                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 2349                         mutex_enter(&attrzp->z_acl_lock);
 2350                 mutex_enter(&attrzp->z_lock);
 2351                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 2352                     SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 2353                     sizeof (attrzp->z_pflags));
 2354                 if (projid != ZFS_INVALID_PROJID) {
 2355                         attrzp->z_projid = projid;
 2356                         SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 2357                             SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
 2358                             sizeof (attrzp->z_projid));
 2359                 }
 2360         }
 2361 
 2362         if (mask & (ATTR_UID|ATTR_GID)) {
 2363 
 2364                 if (mask & ATTR_UID) {
 2365                         ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
 2366                         new_uid = zfs_uid_read(ZTOI(zp));
 2367                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 2368                             &new_uid, sizeof (new_uid));
 2369                         if (attrzp) {
 2370                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 2371                                     SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 2372                                     sizeof (new_uid));
 2373                                 ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
 2374                         }
 2375                 }
 2376 
 2377                 if (mask & ATTR_GID) {
 2378                         ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
 2379                         new_gid = zfs_gid_read(ZTOI(zp));
 2380                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 2381                             NULL, &new_gid, sizeof (new_gid));
 2382                         if (attrzp) {
 2383                                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 2384                                     SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 2385                                     sizeof (new_gid));
 2386                                 ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
 2387                         }
 2388                 }
 2389                 if (!(mask & ATTR_MODE)) {
 2390                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 2391                             NULL, &new_mode, sizeof (new_mode));
 2392                         new_mode = zp->z_mode;
 2393                 }
 2394                 err = zfs_acl_chown_setattr(zp);
 2395                 ASSERT(err == 0);
 2396                 if (attrzp) {
 2397                         err = zfs_acl_chown_setattr(attrzp);
 2398                         ASSERT(err == 0);
 2399                 }
 2400         }
 2401 
 2402         if (mask & ATTR_MODE) {
 2403                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 2404                     &new_mode, sizeof (new_mode));
 2405                 zp->z_mode = ZTOI(zp)->i_mode = new_mode;
 2406                 ASSERT3P(aclp, !=, NULL);
 2407                 err = zfs_aclset_common(zp, aclp, cr, tx);
 2408                 ASSERT0(err);
 2409                 if (zp->z_acl_cached)
 2410                         zfs_acl_free(zp->z_acl_cached);
 2411                 zp->z_acl_cached = aclp;
 2412                 aclp = NULL;
 2413         }
 2414 
 2415         if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
 2416                 zp->z_atime_dirty = B_FALSE;
 2417                 ZFS_TIME_ENCODE(&ip->i_atime, atime);
 2418                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 2419                     &atime, sizeof (atime));
 2420         }
 2421 
 2422         if (mask & (ATTR_MTIME | ATTR_SIZE)) {
 2423                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 2424                 ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
 2425                     vap->va_mtime, ZTOI(zp));
 2426 
 2427                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 2428                     mtime, sizeof (mtime));
 2429         }
 2430 
 2431         if (mask & (ATTR_CTIME | ATTR_SIZE)) {
 2432                 ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
 2433                 ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
 2434                     ZTOI(zp));
 2435                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 2436                     ctime, sizeof (ctime));
 2437         }
 2438 
 2439         if (projid != ZFS_INVALID_PROJID) {
 2440                 zp->z_projid = projid;
 2441                 SA_ADD_BULK_ATTR(bulk, count,
 2442                     SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
 2443                     sizeof (zp->z_projid));
 2444         }
 2445 
 2446         if (attrzp && mask) {
 2447                 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 2448                     SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
 2449                     sizeof (ctime));
 2450         }
 2451 
 2452         /*
 2453          * Do this after setting timestamps to prevent timestamp
 2454          * update from toggling bit
 2455          */
 2456 
 2457         if (xoap && (mask & ATTR_XVATTR)) {
 2458 
 2459                 /*
 2460                  * restore trimmed off masks
 2461                  * so that return masks can be set for caller.
 2462                  */
 2463 
 2464                 if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
 2465                         XVA_SET_REQ(xvap, XAT_APPENDONLY);
 2466                 }
 2467                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
 2468                         XVA_SET_REQ(xvap, XAT_NOUNLINK);
 2469                 }
 2470                 if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
 2471                         XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 2472                 }
 2473                 if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
 2474                         XVA_SET_REQ(xvap, XAT_NODUMP);
 2475                 }
 2476                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
 2477                         XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 2478                 }
 2479                 if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
 2480                         XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 2481                 }
 2482                 if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
 2483                         XVA_SET_REQ(xvap, XAT_PROJINHERIT);
 2484                 }
 2485 
 2486                 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 2487                         ASSERT(S_ISREG(ip->i_mode));
 2488 
 2489                 zfs_xvattr_set(zp, xvap, tx);
 2490         }
 2491 
 2492         if (fuid_dirtied)
 2493                 zfs_fuid_sync(zfsvfs, tx);
 2494 
 2495         if (mask != 0)
 2496                 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 2497 
 2498         mutex_exit(&zp->z_lock);
 2499         if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 2500                 mutex_exit(&zp->z_acl_lock);
 2501 
 2502         if (attrzp) {
 2503                 if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 2504                         mutex_exit(&attrzp->z_acl_lock);
 2505                 mutex_exit(&attrzp->z_lock);
 2506         }
 2507 out:
 2508         if (err == 0 && xattr_count > 0) {
 2509                 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 2510                     xattr_count, tx);
 2511                 ASSERT(err2 == 0);
 2512         }
 2513 
 2514         if (aclp)
 2515                 zfs_acl_free(aclp);
 2516 
 2517         if (fuidp) {
 2518                 zfs_fuid_info_free(fuidp);
 2519                 fuidp = NULL;
 2520         }
 2521 
 2522         if (err) {
 2523                 dmu_tx_abort(tx);
 2524                 if (attrzp)
 2525                         zrele(attrzp);
 2526                 if (err == ERESTART)
 2527                         goto top;
 2528         } else {
 2529                 if (count > 0)
 2530                         err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 2531                 dmu_tx_commit(tx);
 2532                 if (attrzp) {
 2533                         if (err2 == 0 && handle_eadir)
 2534                                 err = zfs_setattr_dir(attrzp);
 2535                         zrele(attrzp);
 2536                 }
 2537                 zfs_znode_update_vfs(zp);
 2538         }
 2539 
 2540 out2:
 2541         if (os->os_sync == ZFS_SYNC_ALWAYS)
 2542                 zil_commit(zilog, 0);
 2543 
 2544 out3:
 2545         kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
 2546         kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
 2547         kmem_free(tmpxvattr, sizeof (xvattr_t));
 2548         zfs_exit(zfsvfs, FTAG);
 2549         return (err);
 2550 }
 2551 
/*
 * One entry in the singly linked list of parent locks that
 * zfs_rename_lock() builds and zfs_rename_unlock() tears down.
 */
typedef struct zfs_zlock {
        krwlock_t       *zl_rwlock;     /* z_parent_lock we acquired */
        znode_t         *zl_znode;      /* znode we hold, or NULL if none */
        struct zfs_zlock *zl_next;      /* next entry; list is newest-first */
} zfs_zlock_t;
 2557 
 2558 /*
 2559  * Drop locks and release vnodes that were held by zfs_rename_lock().
 2560  */
 2561 static void
 2562 zfs_rename_unlock(zfs_zlock_t **zlpp)
 2563 {
 2564         zfs_zlock_t *zl;
 2565 
 2566         while ((zl = *zlpp) != NULL) {
 2567                 if (zl->zl_znode != NULL)
 2568                         zfs_zrele_async(zl->zl_znode);
 2569                 rw_exit(zl->zl_rwlock);
 2570                 *zlpp = zl->zl_next;
 2571                 kmem_free(zl, sizeof (*zl));
 2572         }
 2573 }
 2574 
 2575 /*
 2576  * Search back through the directory tree, using the ".." entries.
 2577  * Lock each directory in the chain to prevent concurrent renames.
 2578  * Fail any attempt to move a directory into one of its own descendants.
 2579  * XXX - z_parent_lock can overlap with map or grow locks
 2580  */
 2581 static int
 2582 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 2583 {
 2584         zfs_zlock_t     *zl;
 2585         znode_t         *zp = tdzp;
 2586         uint64_t        rootid = ZTOZSB(zp)->z_root;
 2587         uint64_t        oidp = zp->z_id;
 2588         krwlock_t       *rwlp = &szp->z_parent_lock;
 2589         krw_t           rw = RW_WRITER;
 2590 
 2591         /*
 2592          * First pass write-locks szp and compares to zp->z_id.
 2593          * Later passes read-lock zp and compare to zp->z_parent.
 2594          */
 2595         do {
 2596                 if (!rw_tryenter(rwlp, rw)) {
 2597                         /*
 2598                          * Another thread is renaming in this path.
 2599                          * Note that if we are a WRITER, we don't have any
 2600                          * parent_locks held yet.
 2601                          */
 2602                         if (rw == RW_READER && zp->z_id > szp->z_id) {
 2603                                 /*
 2604                                  * Drop our locks and restart
 2605                                  */
 2606                                 zfs_rename_unlock(&zl);
 2607                                 *zlpp = NULL;
 2608                                 zp = tdzp;
 2609                                 oidp = zp->z_id;
 2610                                 rwlp = &szp->z_parent_lock;
 2611                                 rw = RW_WRITER;
 2612                                 continue;
 2613                         } else {
 2614                                 /*
 2615                                  * Wait for other thread to drop its locks
 2616                                  */
 2617                                 rw_enter(rwlp, rw);
 2618                         }
 2619                 }
 2620 
 2621                 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 2622                 zl->zl_rwlock = rwlp;
 2623                 zl->zl_znode = NULL;
 2624                 zl->zl_next = *zlpp;
 2625                 *zlpp = zl;
 2626 
 2627                 if (oidp == szp->z_id)          /* We're a descendant of szp */
 2628                         return (SET_ERROR(EINVAL));
 2629 
 2630                 if (oidp == rootid)             /* We've hit the top */
 2631                         return (0);
 2632 
 2633                 if (rw == RW_READER) {          /* i.e. not the first pass */
 2634                         int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
 2635                         if (error)
 2636                                 return (error);
 2637                         zl->zl_znode = zp;
 2638                 }
 2639                 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
 2640                     &oidp, sizeof (oidp));
 2641                 rwlp = &zp->z_parent_lock;
 2642                 rw = RW_READER;
 2643 
 2644         } while (zp->z_id != sdzp->z_id);
 2645 
 2646         return (0);
 2647 }
 2648 
 2649 /*
 2650  * Move an entry from the provided source directory to the target
 2651  * directory.  Change the entry name as indicated.
 2652  *
 2653  *      IN:     sdzp    - Source directory containing the "old entry".
 2654  *              snm     - Old entry name.
 2655  *              tdzp    - Target directory to contain the "new entry".
 2656  *              tnm     - New entry name.
 2657  *              cr      - credentials of caller.
 2658  *              flags   - case flags
 2659  *              rflags  - RENAME_* flags
 2660  *              wo_vap  - attributes for RENAME_WHITEOUT (must be a char device 0:0).
 2661  *              mnt_ns  - user namespace of the mount
 2662  *
 2663  *      RETURN: 0 on success, error code on failure.
 2664  *
 2665  * Timestamps:
 2666  *      sdzp,tdzp - ctime|mtime updated
 2667  */
 2668 int
 2669 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
 2670     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns)
 2671 {
 2672         znode_t         *szp, *tzp;
 2673         zfsvfs_t        *zfsvfs = ZTOZSB(sdzp);
 2674         zilog_t         *zilog;
 2675         zfs_dirlock_t   *sdl, *tdl;
 2676         dmu_tx_t        *tx;
 2677         zfs_zlock_t     *zl;
 2678         int             cmp, serr, terr;
 2679         int             error = 0;
 2680         int             zflg = 0;
 2681         boolean_t       waited = B_FALSE;
 2682         /* Needed for whiteout inode creation. */
 2683         boolean_t       fuid_dirtied;
 2684         zfs_acl_ids_t   acl_ids;
 2685         boolean_t       have_acl = B_FALSE;
 2686         znode_t         *wzp = NULL;
 2687 
 2688 
 2689         if (snm == NULL || tnm == NULL)
 2690                 return (SET_ERROR(EINVAL));
 2691 
 2692         if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 2693                 return (SET_ERROR(EINVAL));
 2694 
 2695         /* Already checked by Linux VFS, but just to make sure. */
 2696         if (rflags & RENAME_EXCHANGE &&
 2697             (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
 2698                 return (SET_ERROR(EINVAL));
 2699 
 2700         /*
 2701          * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
 2702          * right kind of vattr_t for the whiteout file. These are set
 2703          * internally by ZFS so should never be incorrect.
 2704          */
 2705         VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
 2706         VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
 2707         VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
 2708 
 2709         if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
 2710                 return (error);
 2711         zilog = zfsvfs->z_log;
 2712 
 2713         if ((error = zfs_verify_zp(tdzp)) != 0) {
 2714                 zfs_exit(zfsvfs, FTAG);
 2715                 return (error);
 2716         }
 2717 
 2718         /*
 2719          * We check i_sb because snapshots and the ctldir must have different
 2720          * super blocks.
 2721          */
 2722         if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
 2723             zfsctl_is_node(ZTOI(tdzp))) {
 2724                 zfs_exit(zfsvfs, FTAG);
 2725                 return (SET_ERROR(EXDEV));
 2726         }
 2727 
 2728         if (zfsvfs->z_utf8 && u8_validate(tnm,
 2729             strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 2730                 zfs_exit(zfsvfs, FTAG);
 2731                 return (SET_ERROR(EILSEQ));
 2732         }
 2733 
 2734         if (flags & FIGNORECASE)
 2735                 zflg |= ZCILOOK;
 2736 
 2737 top:
 2738         szp = NULL;
 2739         tzp = NULL;
 2740         zl = NULL;
 2741 
 2742         /*
 2743          * This is to prevent the creation of links into attribute space
 2744          * by renaming a linked file into/outof an attribute directory.
 2745          * See the comment in zfs_link() for why this is considered bad.
 2746          */
 2747         if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 2748                 zfs_exit(zfsvfs, FTAG);
 2749                 return (SET_ERROR(EINVAL));
 2750         }
 2751 
 2752         /*
 2753          * Lock source and target directory entries.  To prevent deadlock,
 2754          * a lock ordering must be defined.  We lock the directory with
 2755          * the smallest object id first, or if it's a tie, the one with
 2756          * the lexically first name.
 2757          */
 2758         if (sdzp->z_id < tdzp->z_id) {
 2759                 cmp = -1;
 2760         } else if (sdzp->z_id > tdzp->z_id) {
 2761                 cmp = 1;
 2762         } else {
 2763                 /*
 2764                  * First compare the two name arguments without
 2765                  * considering any case folding.
 2766                  */
 2767                 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
 2768 
 2769                 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
 2770                 ASSERT(error == 0 || !zfsvfs->z_utf8);
 2771                 if (cmp == 0) {
 2772                         /*
 2773                          * POSIX: "If the old argument and the new argument
 2774                          * both refer to links to the same existing file,
 2775                          * the rename() function shall return successfully
 2776                          * and perform no other action."
 2777                          */
 2778                         zfs_exit(zfsvfs, FTAG);
 2779                         return (0);
 2780                 }
 2781                 /*
 2782                  * If the file system is case-folding, then we may
 2783                  * have some more checking to do.  A case-folding file
 2784                  * system is either supporting mixed case sensitivity
 2785                  * access or is completely case-insensitive.  Note
 2786                  * that the file system is always case preserving.
 2787                  *
 2788                  * In mixed sensitivity mode case sensitive behavior
 2789                  * is the default.  FIGNORECASE must be used to
 2790                  * explicitly request case insensitive behavior.
 2791                  *
 2792                  * If the source and target names provided differ only
 2793                  * by case (e.g., a request to rename 'tim' to 'Tim'),
 2794                  * we will treat this as a special case in the
 2795                  * case-insensitive mode: as long as the source name
 2796                  * is an exact match, we will allow this to proceed as
 2797                  * a name-change request.
 2798                  */
 2799                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 2800                     (zfsvfs->z_case == ZFS_CASE_MIXED &&
 2801                     flags & FIGNORECASE)) &&
 2802                     u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
 2803                     &error) == 0) {
 2804                         /*
 2805                          * case preserving rename request, require exact
 2806                          * name matches
 2807                          */
 2808                         zflg |= ZCIEXACT;
 2809                         zflg &= ~ZCILOOK;
 2810                 }
 2811         }
 2812 
 2813         /*
 2814          * If the source and destination directories are the same, we should
 2815          * grab the z_name_lock of that directory only once.
 2816          */
 2817         if (sdzp == tdzp) {
 2818                 zflg |= ZHAVELOCK;
 2819                 rw_enter(&sdzp->z_name_lock, RW_READER);
 2820         }
 2821 
 2822         if (cmp < 0) {
 2823                 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
 2824                     ZEXISTS | zflg, NULL, NULL);
 2825                 terr = zfs_dirent_lock(&tdl,
 2826                     tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
 2827         } else {
 2828                 terr = zfs_dirent_lock(&tdl,
 2829                     tdzp, tnm, &tzp, zflg, NULL, NULL);
 2830                 serr = zfs_dirent_lock(&sdl,
 2831                     sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
 2832                     NULL, NULL);
 2833         }
 2834 
 2835         if (serr) {
 2836                 /*
 2837                  * Source entry invalid or not there.
 2838                  */
 2839                 if (!terr) {
 2840                         zfs_dirent_unlock(tdl);
 2841                         if (tzp)
 2842                                 zrele(tzp);
 2843                 }
 2844 
 2845                 if (sdzp == tdzp)
 2846                         rw_exit(&sdzp->z_name_lock);
 2847 
 2848                 if (strcmp(snm, "..") == 0)
 2849                         serr = EINVAL;
 2850                 zfs_exit(zfsvfs, FTAG);
 2851                 return (serr);
 2852         }
 2853         if (terr) {
 2854                 zfs_dirent_unlock(sdl);
 2855                 zrele(szp);
 2856 
 2857                 if (sdzp == tdzp)
 2858                         rw_exit(&sdzp->z_name_lock);
 2859 
 2860                 if (strcmp(tnm, "..") == 0)
 2861                         terr = EINVAL;
 2862                 zfs_exit(zfsvfs, FTAG);
 2863                 return (terr);
 2864         }
 2865 
 2866         /*
 2867          * If we are using project inheritance, means if the directory has
 2868          * ZFS_PROJINHERIT set, then its descendant directories will inherit
 2869          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
 2870          * such case, we only allow renames into our tree when the project
 2871          * IDs are the same.
 2872          */
 2873         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
 2874             tdzp->z_projid != szp->z_projid) {
 2875                 error = SET_ERROR(EXDEV);
 2876                 goto out;
 2877         }
 2878 
 2879         /*
 2880          * Must have write access at the source to remove the old entry
 2881          * and write access at the target to create the new entry.
 2882          * Note that if target and source are the same, this can be
 2883          * done in a single check.
 2884          */
 2885         if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
 2886                 goto out;
 2887 
 2888         if (S_ISDIR(ZTOI(szp)->i_mode)) {
 2889                 /*
 2890                  * Check to make sure rename is valid.
 2891                  * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 2892                  */
 2893                 if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
 2894                         goto out;
 2895         }
 2896 
 2897         /*
 2898          * Does target exist?
 2899          */
 2900         if (tzp) {
 2901                 if (rflags & RENAME_NOREPLACE) {
 2902                         error = SET_ERROR(EEXIST);
 2903                         goto out;
 2904                 }
 2905                 /*
 2906                  * Source and target must be the same type (unless exchanging).
 2907                  */
 2908                 if (!(rflags & RENAME_EXCHANGE)) {
 2909                         boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
 2910                         boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
 2911 
 2912                         if (s_is_dir != t_is_dir) {
 2913                                 error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
 2914                                 goto out;
 2915                         }
 2916                 }
 2917                 /*
 2918                  * POSIX dictates that when the source and target
 2919                  * entries refer to the same file object, rename
 2920                  * must do nothing and exit without error.
 2921                  */
 2922                 if (szp->z_id == tzp->z_id) {
 2923                         error = 0;
 2924                         goto out;
 2925                 }
 2926         } else if (rflags & RENAME_EXCHANGE) {
 2927                 /* Target must exist for RENAME_EXCHANGE. */
 2928                 error = SET_ERROR(ENOENT);
 2929                 goto out;
 2930         }
 2931 
 2932         /* Set up inode creation for RENAME_WHITEOUT. */
 2933         if (rflags & RENAME_WHITEOUT) {
 2934                 /*
 2935                  * Whiteout files are not regular files or directories, so to
 2936                  * match zfs_create() we do not inherit the project id.
 2937                  */
 2938                 uint64_t wo_projid = ZFS_DEFAULT_PROJID;
 2939 
 2940                 error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
 2941                 if (error)
 2942                         goto out;
 2943 
 2944                 if (!have_acl) {
 2945                         error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
 2946                             &acl_ids, mnt_ns);
 2947                         if (error)
 2948                                 goto out;
 2949                         have_acl = B_TRUE;
 2950                 }
 2951 
 2952                 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
 2953                         error = SET_ERROR(EDQUOT);
 2954                         goto out;
 2955                 }
 2956         }
 2957 
 2958         tx = dmu_tx_create(zfsvfs->z_os);
 2959         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 2960         dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 2961         dmu_tx_hold_zap(tx, sdzp->z_id,
 2962             (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
 2963         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 2964         if (sdzp != tdzp) {
 2965                 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 2966                 zfs_sa_upgrade_txholds(tx, tdzp);
 2967         }
 2968         if (tzp) {
 2969                 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 2970                 zfs_sa_upgrade_txholds(tx, tzp);
 2971         }
 2972         if (rflags & RENAME_WHITEOUT) {
 2973                 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 2974                     ZFS_SA_BASE_ATTR_SIZE);
 2975 
 2976                 dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
 2977                 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 2978                 if (!zfsvfs->z_use_sa &&
 2979                     acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 2980                         dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 2981                             0, acl_ids.z_aclp->z_acl_bytes);
 2982                 }
 2983         }
 2984         fuid_dirtied = zfsvfs->z_fuid_dirty;
 2985         if (fuid_dirtied)
 2986                 zfs_fuid_txhold(zfsvfs, tx);
 2987         zfs_sa_upgrade_txholds(tx, szp);
 2988         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 2989         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 2990         if (error) {
 2991                 if (zl != NULL)
 2992                         zfs_rename_unlock(&zl);
 2993                 zfs_dirent_unlock(sdl);
 2994                 zfs_dirent_unlock(tdl);
 2995 
 2996                 if (sdzp == tdzp)
 2997                         rw_exit(&sdzp->z_name_lock);
 2998 
 2999                 if (error == ERESTART) {
 3000                         waited = B_TRUE;
 3001                         dmu_tx_wait(tx);
 3002                         dmu_tx_abort(tx);
 3003                         zrele(szp);
 3004                         if (tzp)
 3005                                 zrele(tzp);
 3006                         goto top;
 3007                 }
 3008                 dmu_tx_abort(tx);
 3009                 zrele(szp);
 3010                 if (tzp)
 3011                         zrele(tzp);
 3012                 zfs_exit(zfsvfs, FTAG);
 3013                 return (error);
 3014         }
 3015 
 3016         /*
 3017          * Unlink the source.
 3018          */
 3019         szp->z_pflags |= ZFS_AV_MODIFIED;
 3020         if (tdzp->z_pflags & ZFS_PROJINHERIT)
 3021                 szp->z_pflags |= ZFS_PROJINHERIT;
 3022 
 3023         error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 3024             (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 3025         VERIFY0(error);
 3026 
 3027         error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 3028         if (error)
 3029                 goto commit;
 3030 
 3031         /*
 3032          * Unlink the target.
 3033          */
 3034         if (tzp) {
 3035                 int tzflg = zflg;
 3036 
 3037                 if (rflags & RENAME_EXCHANGE) {
 3038                         /* This inode will be re-linked soon. */
 3039                         tzflg |= ZRENAMING;
 3040 
 3041                         tzp->z_pflags |= ZFS_AV_MODIFIED;
 3042                         if (sdzp->z_pflags & ZFS_PROJINHERIT)
 3043                                 tzp->z_pflags |= ZFS_PROJINHERIT;
 3044 
 3045                         error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 3046                             (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
 3047                         ASSERT0(error);
 3048                 }
 3049                 error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
 3050                 if (error)
 3051                         goto commit_link_szp;
 3052         }
 3053 
 3054         /*
 3055          * Create the new target links:
 3056          *   * We always link the target.
 3057          *   * RENAME_EXCHANGE: Link the old target to the source.
 3058          *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
 3059          */
 3060         error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 3061         if (error) {
 3062                 /*
 3063                  * If we have removed the existing target, a subsequent call to
 3064                  * zfs_link_create() to add back the same entry, but with a new
 3065                  * dnode (szp), should not fail.
 3066                  */
 3067                 ASSERT3P(tzp, ==, NULL);
 3068                 goto commit_link_tzp;
 3069         }
 3070 
 3071         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
 3072         case RENAME_EXCHANGE:
 3073                 error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
 3074                 /*
 3075                  * The same argument as zfs_link_create() failing for
 3076                  * szp applies here, since the source directory must
 3077                  * have had an entry we are replacing.
 3078                  */
 3079                 ASSERT0(error);
 3080                 if (error)
 3081                         goto commit_unlink_td_szp;
 3082                 break;
 3083         case RENAME_WHITEOUT:
 3084                 zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
 3085                 error = zfs_link_create(sdl, wzp, tx, ZNEW);
 3086                 if (error) {
 3087                         zfs_znode_delete(wzp, tx);
 3088                         remove_inode_hash(ZTOI(wzp));
 3089                         goto commit_unlink_td_szp;
 3090                 }
 3091                 break;
 3092         }
 3093 
 3094         if (fuid_dirtied)
 3095                 zfs_fuid_sync(zfsvfs, tx);
 3096 
 3097         switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
 3098         case RENAME_EXCHANGE:
 3099                 zfs_log_rename_exchange(zilog, tx,
 3100                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
 3101                     tdzp, tdl->dl_name, szp);
 3102                 break;
 3103         case RENAME_WHITEOUT:
 3104                 zfs_log_rename_whiteout(zilog, tx,
 3105                     (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
 3106                     tdzp, tdl->dl_name, szp, wzp);
 3107                 break;
 3108         default:
 3109                 ASSERT0(rflags & ~RENAME_NOREPLACE);
 3110                 zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
 3111                     sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
 3112                 break;
 3113         }
 3114 
 3115 commit:
 3116         dmu_tx_commit(tx);
 3117 out:
 3118         if (have_acl)
 3119                 zfs_acl_ids_free(&acl_ids);
 3120 
 3121         zfs_znode_update_vfs(sdzp);
 3122         if (sdzp == tdzp)
 3123                 rw_exit(&sdzp->z_name_lock);
 3124 
 3125         if (sdzp != tdzp)
 3126                 zfs_znode_update_vfs(tdzp);
 3127 
 3128         zfs_znode_update_vfs(szp);
 3129         zrele(szp);
 3130         if (wzp) {
 3131                 zfs_znode_update_vfs(wzp);
 3132                 zrele(wzp);
 3133         }
 3134         if (tzp) {
 3135                 zfs_znode_update_vfs(tzp);
 3136                 zrele(tzp);
 3137         }
 3138 
 3139         if (zl != NULL)
 3140                 zfs_rename_unlock(&zl);
 3141 
 3142         zfs_dirent_unlock(sdl);
 3143         zfs_dirent_unlock(tdl);
 3144 
 3145         if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 3146                 zil_commit(zilog, 0);
 3147 
 3148         zfs_exit(zfsvfs, FTAG);
 3149         return (error);
 3150 
 3151         /*
 3152          * Clean-up path for broken link state.
 3153          *
 3154          * At this point we are in a (very) bad state, so we need to do our
 3155          * best to correct the state. In particular, all of the nlinks are
 3156          * wrong because we were destroying and creating links with ZRENAMING.
 3157          *
 3158          * In some form, all of these operations have to resolve the state:
 3159          *
 3160          *  * link_destroy() *must* succeed. Fortunately, this is very likely
 3161          *    since we only just created it.
 3162          *
 3163          *  * link_create()s are allowed to fail (though they shouldn't because
 3164          *    we only just unlinked them and are putting the entries back
 3165          *    during clean-up). But if they fail, we can just forcefully drop
 3166          *    the nlink value to (at the very least) avoid broken nlink values
 3167          *    -- though in the case of non-empty directories we will have to
 3168          *    panic (otherwise we'd have a leaked directory with a broken ..).
 3169          */
 3170 commit_unlink_td_szp:
 3171         VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
 3172 commit_link_tzp:
 3173         if (tzp) {
 3174                 if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
 3175                         VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
 3176         }
 3177 commit_link_szp:
 3178         if (zfs_link_create(sdl, szp, tx, ZRENAMING))
 3179                 VERIFY0(zfs_drop_nlink(szp, tx, NULL));
 3180         goto commit;
 3181 }
 3182 
 3183 /*
 3184  * Insert the indicated symbolic reference entry into the directory.
 3185  *
 3186  *      IN:     dzp     - Directory to contain new symbolic link.
 3187  *              name    - Name of directory entry in dzp.
 3188  *              vap     - Attributes of new entry.
 3189  *              link    - Name for new symlink entry.
 3190  *              cr      - credentials of caller.
 3191  *              flags   - case flags
 3192  *              mnt_ns  - user namespace of the mount
 3193  *
 3194  *      OUT:    zpp     - Znode for new symbolic link.
       *                        Reset to NULL each time the directory lock is
       *                        attempted; failures before that point leave
       *                        *zpp untouched.
 3195  *
 3196  *      RETURN: 0 on success, error code on failure.
       *              EINVAL if name is NULL, EILSEQ if name fails UTF-8
       *              validation on a z_utf8 dataset, ENAMETOOLONG if link
       *              is longer than MAXPATHLEN, EDQUOT if
       *              zfs_acl_ids_overquota() reports over quota; any other
       *              error is passed through from the helper that failed.
 3197  *
 3198  * Timestamps:
 3199  *      dzp - ctime|mtime updated
 3200  */
 3201 int
 3202 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
 3203     znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns)
 3204 {
 3205         znode_t         *zp;
 3206         zfs_dirlock_t   *dl;
 3207         dmu_tx_t        *tx;
 3208         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 3209         zilog_t         *zilog;
 3210         uint64_t        len = strlen(link);
 3211         int             error;
 3212         int             zflg = ZNEW;
 3213         zfs_acl_ids_t   acl_ids;
 3214         boolean_t       fuid_dirtied;
 3215         uint64_t        txtype = TX_SYMLINK;
 3216         boolean_t       waited = B_FALSE;
 3217
 3218         ASSERT(S_ISLNK(vap->va_mode));
 3219
 3220         if (name == NULL)
 3221                 return (SET_ERROR(EINVAL));
 3222
 3223         if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 3224                 return (error);
 3225         zilog = zfsvfs->z_log;
 3226
              /* On z_utf8 datasets the entry name must pass u8_validate(). */
 3227         if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 3228             NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 3229                 zfs_exit(zfsvfs, FTAG);
 3230                 return (SET_ERROR(EILSEQ));
 3231         }
 3232         if (flags & FIGNORECASE)
 3233                 zflg |= ZCILOOK;
 3234
 3235         if (len > MAXPATHLEN) {
 3236                 zfs_exit(zfsvfs, FTAG);
 3237                 return (SET_ERROR(ENAMETOOLONG));
 3238         }
 3239
              /*
               * Build the ACL ids once, ahead of the retry label.  From here
               * on every return path frees them; the ERESTART retry below
               * keeps them and reuses them.
               */
 3240         if ((error = zfs_acl_ids_create(dzp, 0,
 3241             vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
 3242                 zfs_exit(zfsvfs, FTAG);
 3243                 return (error);
 3244         }
 3245 top:
 3246         *zpp = NULL;
 3247
 3248         /*
 3249          * Attempt to lock directory; fail if entry already exists.
 3250          */
 3251         error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
 3252         if (error) {
 3253                 zfs_acl_ids_free(&acl_ids);
 3254                 zfs_exit(zfsvfs, FTAG);
 3255                 return (error);
 3256         }
 3257
              /* The caller needs ACE_ADD_FILE access on the directory. */
 3258         if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 3259                 zfs_acl_ids_free(&acl_ids);
 3260                 zfs_dirent_unlock(dl);
 3261                 zfs_exit(zfsvfs, FTAG);
 3262                 return (error);
 3263         }
 3264
 3265         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
 3266                 zfs_acl_ids_free(&acl_ids);
 3267                 zfs_dirent_unlock(dl);
 3268                 zfs_exit(zfsvfs, FTAG);
 3269                 return (SET_ERROR(EDQUOT));
 3270         }
              /*
               * Register the holds for this transaction: a write of at least
               * one byte to a new object, the directory's ZAP entry for name,
               * SA creation sized for the ACL plus the link text, and the
               * directory's own SA.
               */
 3271         tx = dmu_tx_create(zfsvfs->z_os);
 3272         fuid_dirtied = zfsvfs->z_fuid_dirty;
 3273         dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 3274         dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 3275         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 3276             ZFS_SA_BASE_ATTR_SIZE + len);
 3277         dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
              /*
               * When z_use_sa is clear and the ACL exceeds ZFS_ACE_SPACE, an
               * extra write hold on a new object is taken for the ACL bytes.
               */
 3278         if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 3279                 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 3280                     acl_ids.z_aclp->z_acl_bytes);
 3281         }
 3282         if (fuid_dirtied)
 3283                 zfs_fuid_txhold(zfsvfs, tx);
              /*
               * Non-blocking assign.  On ERESTART the directory lock is
               * dropped, the tx is waited on and aborted, and the whole
               * sequence restarts from top with TXG_NOTHROTTLE added.
               */
 3284         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 3285         if (error) {
 3286                 zfs_dirent_unlock(dl);
 3287                 if (error == ERESTART) {
 3288                         waited = B_TRUE;
 3289                         dmu_tx_wait(tx);
 3290                         dmu_tx_abort(tx);
 3291                         goto top;
 3292                 }
 3293                 zfs_acl_ids_free(&acl_ids);
 3294                 dmu_tx_abort(tx);
 3295                 zfs_exit(zfsvfs, FTAG);
 3296                 return (error);
 3297         }
 3298
 3299         /*
 3300          * Create a new object for the symlink.
 3301          * for version 4 ZPL datasets the symlink will be an SA attribute
 3302          */
 3303         zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 3304
 3305         if (fuid_dirtied)
 3306                 zfs_fuid_sync(zfsvfs, tx);
 3307
              /*
               * Store the link target under z_lock: as the SA_ZPL_SYMLINK
               * attribute for SA znodes, through zfs_sa_symlink() otherwise.
               * NOTE(review): the sa_update() result stored in error is never
               * checked; zfs_link_create() below overwrites it.
               */
 3308         mutex_enter(&zp->z_lock);
 3309         if (zp->z_is_sa)
 3310                 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 3311                     link, len, tx);
 3312         else
 3313                 zfs_sa_symlink(zp, link, len, tx);
 3314         mutex_exit(&zp->z_lock);
 3315
              /* The size update's return value is explicitly discarded. */
 3316         zp->z_size = len;
 3317         (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 3318             &zp->z_size, sizeof (zp->z_size), tx);
 3319         /*
 3320          * Insert the new object into the directory.
 3321          */
 3322         error = zfs_link_create(dl, zp, tx, ZNEW);
 3323         if (error != 0) {
                      /*
                       * Linking failed: delete the new znode within this tx
                       * and remove its inode from the hash.  zrele(zp)
                       * follows after the commit below.
                       */
 3324                 zfs_znode_delete(zp, tx);
 3325                 remove_inode_hash(ZTOI(zp));
 3326         } else {
 3327                 if (flags & FIGNORECASE)
 3328                         txtype |= TX_CI;
 3329                 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 3330
 3331                 zfs_znode_update_vfs(dzp);
 3332                 zfs_znode_update_vfs(zp);
 3333         }
 3334
 3335         zfs_acl_ids_free(&acl_ids);
 3336
 3337         dmu_tx_commit(tx);
 3338
 3339         zfs_dirent_unlock(dl);
 3340
              /*
               * Only a successful call hands the znode to the caller; when
               * os_sync is ZFS_SYNC_ALWAYS the ZIL is committed before
               * returning.  On failure the znode is released instead.
               */
 3341         if (error == 0) {
 3342                 *zpp = zp;
 3343
 3344                 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 3345                         zil_commit(zilog, 0);
 3346         } else {
 3347                 zrele(zp);
 3348         }
 3349
 3350         zfs_exit(zfsvfs, FTAG);
 3351         return (error);
 3352 }
 3353 
 3354 /*
 3355  * Return, in the buffer contained in the provided uio structure,
 3356  * the symbolic path referred to by ip.
 3357  *
 3358  *      IN:     ip      - inode of symbolic link
 3359  *              uio     - structure to contain the link path.
 3360  *              cr      - credentials of caller (unused in this function).
 3361  *
 3362  *      RETURN: 0 if success
 3363  *              error code if failure
 3364  *
 3365  * Timestamps:
 3366  *      ip - atime updated
       *
       * NOTE(review): nothing in this function updates atime itself;
       * presumably that is handled by the caller -- confirm.
 3367  */
 3368 int
 3369 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
 3370 {
 3371         (void) cr;
 3372         znode_t         *zp = ITOZ(ip);
 3373         zfsvfs_t        *zfsvfs = ITOZSB(ip);
 3374         int             error;
 3375
 3376         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 3377                 return (error);
 3378
              /*
               * Read the link target under z_lock: from the SA_ZPL_SYMLINK
               * attribute for SA znodes, through zfs_sa_readlink() otherwise.
               * Either helper's result is returned unchanged.
               */
 3379         mutex_enter(&zp->z_lock);
 3380         if (zp->z_is_sa)
 3381                 error = sa_lookup_uio(zp->z_sa_hdl,
 3382                     SA_ZPL_SYMLINK(zfsvfs), uio);
 3383         else
 3384                 error = zfs_sa_readlink(zp, uio);
 3385         mutex_exit(&zp->z_lock);
 3386
 3387         zfs_exit(zfsvfs, FTAG);
 3388         return (error);
 3389 }
 3390 
 3391 /*
 3392  * Insert a new entry into directory tdzp referencing szp.
 3393  *
 3394  *      IN:     tdzp    - Directory to contain new entry.
 3395  *              szp     - znode of new entry.
 3396  *              name    - name of new entry.
 3397  *              cr      - credentials of caller.
 3398  *              flags   - case flags.
 3399  *
 3400  *      RETURN: 0 if success
 3401  *              error code if failure
       *              EINVAL if name is NULL or exactly one of szp/tdzp has
       *              ZFS_XATTR set; EPERM if szp is a directory, its parent
       *              is z_shares_dir, or the caller is neither the owner
       *              nor permitted by secpolicy_basic_link(); EXDEV on a
       *              project ID mismatch under ZFS_PROJINHERIT, differing
       *              super blocks, or a ctldir node; EILSEQ if name fails
       *              UTF-8 validation on a z_utf8 dataset.  Other errors
       *              are passed through from the helper that failed.
 3402  *
 3403  * Timestamps:
 3404  *      tdzp - ctime|mtime updated
 3405  *       szp - ctime updated
 3406  */
 3407 int
 3408 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
 3409     int flags)
 3410 {
 3411         struct inode *sip = ZTOI(szp);
 3412         znode_t         *tzp;
 3413         zfsvfs_t        *zfsvfs = ZTOZSB(tdzp);
 3414         zilog_t         *zilog;
 3415         zfs_dirlock_t   *dl;
 3416         dmu_tx_t        *tx;
 3417         int             error;
 3418         int             zf = ZNEW;
 3419         uint64_t        parent;
 3420         uid_t           owner;
 3421         boolean_t       waited = B_FALSE;
 3422         boolean_t       is_tmpfile = 0;
 3423         uint64_t        txg;
              /*
               * is_tmpfile stays 0 unless HAVE_TMPFILE is defined; with it,
               * the source counts as a tmpfile when its inode has no links
               * and carries I_LINKABLE.
               */
 3424 #ifdef HAVE_TMPFILE
 3425         is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
 3426 #endif
 3427         ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
 3428
 3429         if (name == NULL)
 3430                 return (SET_ERROR(EINVAL));
 3431
 3432         if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
 3433                 return (error);
 3434         zilog = zfsvfs->z_log;
 3435
 3436         /*
 3437          * POSIX dictates that we return EPERM here.
 3438          * Better choices include ENOTSUP or EISDIR.
 3439          */
 3440         if (S_ISDIR(sip->i_mode)) {
 3441                 zfs_exit(zfsvfs, FTAG);
 3442                 return (SET_ERROR(EPERM));
 3443         }
 3444
 3445         if ((error = zfs_verify_zp(szp)) != 0) {
 3446                 zfs_exit(zfsvfs, FTAG);
 3447                 return (error);
 3448         }
 3449
 3450         /*
 3451          * If we are using project inheritance, means if the directory has
 3452          * ZFS_PROJINHERIT set, then its descendant directories will inherit
 3453          * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
 3454          * such case, we only allow hard link creation in our tree when the
 3455          * project IDs are the same.
 3456          */
 3457         if (tdzp->z_pflags & ZFS_PROJINHERIT &&
 3458             tdzp->z_projid != szp->z_projid) {
 3459                 zfs_exit(zfsvfs, FTAG);
 3460                 return (SET_ERROR(EXDEV));
 3461         }
 3462
 3463         /*
 3464          * We check i_sb because snapshots and the ctldir must have different
 3465          * super blocks.
 3466          */
 3467         if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
 3468                 zfs_exit(zfsvfs, FTAG);
 3469                 return (SET_ERROR(EXDEV));
 3470         }
 3471
 3472         /* Prevent links to .zfs/shares files */
 3473
 3474         if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 3475             &parent, sizeof (uint64_t))) != 0) {
 3476                 zfs_exit(zfsvfs, FTAG);
 3477                 return (error);
 3478         }
 3479         if (parent == zfsvfs->z_shares_dir) {
 3480                 zfs_exit(zfsvfs, FTAG);
 3481                 return (SET_ERROR(EPERM));
 3482         }
 3483
              /* On z_utf8 datasets the entry name must pass u8_validate(). */
 3484         if (zfsvfs->z_utf8 && u8_validate(name,
 3485             strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 3486                 zfs_exit(zfsvfs, FTAG);
 3487                 return (SET_ERROR(EILSEQ));
 3488         }
 3489         if (flags & FIGNORECASE)
 3490                 zf |= ZCILOOK;
 3491
 3492         /*
 3493          * We do not support links between attributes and non-attributes
 3494          * because of the potential security risk of creating links
 3495          * into "normal" file space in order to circumvent restrictions
 3496          * imposed in attribute space.
 3497          */
 3498         if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
 3499                 zfs_exit(zfsvfs, FTAG);
 3500                 return (SET_ERROR(EINVAL));
 3501         }
 3502
              /*
               * A caller who is not the mapped owner of the source must be
               * allowed by secpolicy_basic_link().
               */
 3503         owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
 3504             cr, ZFS_OWNER);
 3505         if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
 3506                 zfs_exit(zfsvfs, FTAG);
 3507                 return (SET_ERROR(EPERM));
 3508         }
 3509
              /*
               * The caller needs ACE_ADD_FILE access on the target directory;
               * the check is made against kcred->user_ns.
               */
 3510         if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
 3511             kcred->user_ns))) {
 3512                 zfs_exit(zfsvfs, FTAG);
 3513                 return (error);
 3514         }
 3515
 3516 top:
 3517         /*
 3518          * Attempt to lock directory; fail if entry already exists.
 3519          */
 3520         error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
 3521         if (error) {
 3522                 zfs_exit(zfsvfs, FTAG);
 3523                 return (error);
 3524         }
 3525
              /*
               * Register the holds for this transaction: the source's SA, the
               * target directory's ZAP entry for name and, for a tmpfile, the
               * z_unlinkedobj ZAP it is removed from further down.
               */
 3526         tx = dmu_tx_create(zfsvfs->z_os);
 3527         dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 3528         dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
 3529         if (is_tmpfile)
 3530                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 3531
 3532         zfs_sa_upgrade_txholds(tx, szp);
 3533         zfs_sa_upgrade_txholds(tx, tdzp);
              /*
               * Non-blocking assign.  On ERESTART the directory lock is
               * dropped, the tx is waited on and aborted, and the sequence
               * restarts from top with TXG_NOTHROTTLE added.
               */
 3534         error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 3535         if (error) {
 3536                 zfs_dirent_unlock(dl);
 3537                 if (error == ERESTART) {
 3538                         waited = B_TRUE;
 3539                         dmu_tx_wait(tx);
 3540                         dmu_tx_abort(tx);
 3541                         goto top;
 3542                 }
 3543                 dmu_tx_abort(tx);
 3544                 zfs_exit(zfsvfs, FTAG);
 3545                 return (error);
 3546         }
 3547         /* unmark z_unlinked so zfs_link_create will not reject */
 3548         if (is_tmpfile)
 3549                 szp->z_unlinked = B_FALSE;
 3550         error = zfs_link_create(dl, szp, tx, 0);
 3551
 3552         if (error == 0) {
 3553                 uint64_t txtype = TX_LINK;
 3554                 /*
 3555                  * tmpfile is created to be in z_unlinkedobj, so remove it.
 3556                  * Also, we don't log in ZIL, because all previous file
 3557                  * operation on the tmpfile are ignored by ZIL. Instead we
 3558                  * always wait for txg to sync to make sure all previous
 3559                  * operation are sync safe.
 3560                  */
 3561                 if (is_tmpfile) {
 3562                         VERIFY(zap_remove_int(zfsvfs->z_os,
 3563                             zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
 3564                 } else {
 3565                         if (flags & FIGNORECASE)
 3566                                 txtype |= TX_CI;
 3567                         zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
 3568                 }
 3569         } else if (is_tmpfile) {
 3570                 /* restore z_unlinked since linking failed */
 3571                 szp->z_unlinked = B_TRUE;
 3572         }
              /* Capture the txg before the commit; the tmpfile wait uses it. */
 3573         txg = dmu_tx_get_txg(tx);
 3574         dmu_tx_commit(tx);
 3575
 3576         zfs_dirent_unlock(dl);
 3577
              /*
               * Regular files: commit the ZIL when os_sync is ZFS_SYNC_ALWAYS.
               * Tmpfiles: wait for the captured txg to sync unless os_sync is
               * ZFS_SYNC_DISABLED.  Neither step depends on error.
               */
 3578         if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 3579                 zil_commit(zilog, 0);
 3580
 3581         if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
 3582                 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
 3583
              /* Both znodes are refreshed whether or not the link succeeded. */
 3584         zfs_znode_update_vfs(tdzp);
 3585         zfs_znode_update_vfs(szp);
 3586         zfs_exit(zfsvfs, FTAG);
 3587         return (error);
 3588 }
 3589 
 3590 static void
 3591 zfs_putpage_sync_commit_cb(void *arg)
 3592 {
              /*
               * Commit callback that zfs_putpage() hands to zfs_log_write()
               * when for_sync is set.  arg is the page that was passed along
               * with the callback: its error flag is cleared and its
               * writeback state is ended.
               */
 3593         struct page *pp = arg;
 3594
 3595         ClearPageError(pp);
 3596         end_page_writeback(pp);
 3597 }
 3598 
 3599 static void
 3600 zfs_putpage_async_commit_cb(void *arg)
 3601 {
              /*
               * Commit callback that zfs_putpage() hands to zfs_log_write()
               * when for_sync is not set.  Besides clearing the page's error
               * flag and ending writeback, it decrements the owning znode's
               * z_async_writes_cnt, which zfs_putpage() incremented for this
               * page.  The znode is found through the page's mapping host.
               */
 3602         struct page *pp = arg;
 3603         znode_t *zp = ITOZ(pp->mapping->host);
 3604
 3605         ClearPageError(pp);
 3606         end_page_writeback(pp);
 3607         atomic_dec_32(&zp->z_async_writes_cnt);
 3608 }
 3609 
 3610 /*
 3611  * Push a page out to disk, once the page is on stable storage the
 3612  * registered commit callback will be run as notification of completion.
 3613  *
 3614  *      IN:     ip       - page mapped for inode.
 3615  *              pp       - page to push (page is locked)
 3616  *              wbc      - writeback control data
 3617  *              for_sync - does the caller intend to wait synchronously for the
 3618  *                         page writeback to complete?
 3619  *
 3620  *      RETURN: 0 if success
 3621  *              error code if failure
       *
       *              0 is also returned, without writing, when the page lies
       *              beyond end of file, its mapping changed or it is no
       *              longer dirty after the locks are retaken, or another
       *              writeback of it is already in progress.
 3622  *
 3623  * Timestamps:
 3624  *      ip - ctime|mtime updated
 3625  */
 3626 int
 3627 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
 3628     boolean_t for_sync)
 3629 {
 3630         znode_t         *zp = ITOZ(ip);
 3631         zfsvfs_t        *zfsvfs = ITOZSB(ip);
 3632         loff_t          offset;
 3633         loff_t          pgoff;
 3634         unsigned int    pglen;
 3635         dmu_tx_t        *tx;
 3636         caddr_t         va;
 3637         int             err = 0;
 3638         uint64_t        mtime[2], ctime[2];
 3639         sa_bulk_attr_t  bulk[3];
 3640         int             cnt = 0;
 3641         struct address_space *mapping;
 3642
 3643         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 3644                 return (err);
 3645
 3646         ASSERT(PageLocked(pp));
 3647
 3648         pgoff = page_offset(pp);        /* Page byte-offset in file */
 3649         offset = i_size_read(ip);       /* File length in bytes */
 3650         pglen = MIN(PAGE_SIZE,          /* Page length in bytes */
 3651             P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
 3652
 3653         /* Page is beyond end of file */
 3654         if (pgoff >= offset) {
 3655                 unlock_page(pp);
 3656                 zfs_exit(zfsvfs, FTAG);
 3657                 return (0);
 3658         }
 3659
 3660         /* Truncate page length to end of file */
 3661         if (pgoff + pglen > offset)
 3662                 pglen = offset - pgoff;
 3663
      /* Compiled out: quota check for mmap writes, see the FIXME inside. */
 3664 #if 0
 3665         /*
 3666          * FIXME: Allow mmap writes past its quota.  The correct fix
 3667          * is to register a page_mkwrite() handler to count the page
 3668          * against its quota when it is about to be dirtied.
 3669          */
 3670         if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
 3671             KUID_TO_SUID(ip->i_uid)) ||
 3672             zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 3673             KGID_TO_SGID(ip->i_gid)) ||
 3674             (zp->z_projid != ZFS_DEFAULT_PROJID &&
 3675             zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 3676             zp->z_projid))) {
 3677                 err = EDQUOT;
 3678         }
 3679 #endif
 3680
 3681         /*
 3682          * The ordering here is critical and must adhere to the following
 3683          * rules in order to avoid deadlocking in either zfs_read() or
 3684          * zfs_free_range() due to a lock inversion.
 3685          *
 3686          * 1) The page must be unlocked prior to acquiring the range lock.
 3687          *    This is critical because zfs_read() calls find_lock_page()
 3688          *    which may block on the page lock while holding the range lock.
 3689          *
 3690          * 2) Before setting or clearing write back on a page the range lock
 3691          *    must be held in order to prevent a lock inversion with the
 3692          *    zfs_free_range() function.
 3693          *
 3694          * This presents a problem because upon entering this function the
 3695          * page lock is already held.  To safely acquire the range lock the
 3696          * page lock must be dropped.  This creates a window where another
 3697          * process could truncate, invalidate, dirty, or write out the page.
 3698          *
 3699          * Therefore, after successfully reacquiring the range and page locks
 3700          * the current page state is checked.  In the common case everything
 3701          * will be as is expected and it can be written out.  However, if
 3702          * the page state has changed it must be handled accordingly.
 3703          */
              /*
               * The mapping is remembered so that a change across the
               * unlock/relock window can be detected below.
               */
 3704         mapping = pp->mapping;
 3705         redirty_page_for_writepage(wbc, pp);
 3706         unlock_page(pp);
 3707
 3708         zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
 3709             pgoff, pglen, RL_WRITER);
 3710         lock_page(pp);
 3711
 3712         /* Page mapping changed or it was no longer dirty, we're done */
 3713         if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
 3714                 unlock_page(pp);
 3715                 zfs_rangelock_exit(lr);
 3716                 zfs_exit(zfsvfs, FTAG);
 3717                 return (0);
 3718         }
 3719
 3720         /* Another process started writeback; block on it if required */
 3721         if (PageWriteback(pp)) {
 3722                 unlock_page(pp);
 3723                 zfs_rangelock_exit(lr);
 3724
 3725                 if (wbc->sync_mode != WB_SYNC_NONE) {
 3726                         /*
 3727                          * Speed up any non-sync page writebacks since
 3728                          * they may take several seconds to complete.
 3729                          * Refer to the comment in zpl_fsync() (when
 3730                          * HAVE_FSYNC_RANGE is defined) for details.
 3731                          */
 3732                         if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
 3733                                 zil_commit(zfsvfs->z_log, zp->z_id);
 3734                         }
 3735
 3736                         if (PageWriteback(pp))
 3737 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
 3738                                 folio_wait_bit(page_folio(pp), PG_writeback);
 3739 #else
 3740                                 wait_on_page_bit(pp, PG_writeback);
 3741 #endif
 3742                 }
 3743
 3744                 zfs_exit(zfsvfs, FTAG);
 3745                 return (0);
 3746         }
 3747
 3748         /* Clear the dirty flag now that the required locks are held */
 3749         if (!clear_page_dirty_for_io(pp)) {
 3750                 unlock_page(pp);
 3751                 zfs_rangelock_exit(lr);
 3752                 zfs_exit(zfsvfs, FTAG);
 3753                 return (0);
 3754         }
 3755
 3756         /*
 3757          * Counterpart for redirty_page_for_writepage() above.  This page
 3758          * was in fact not skipped and should not be counted as if it were.
 3759          */
 3760         wbc->pages_skipped--;
              /*
               * Async writebacks are counted here; the count is dropped again
               * by the async commit callback or by the failure path below.
               */
 3761         if (!for_sync)
 3762                 atomic_inc_32(&zp->z_async_writes_cnt);
 3763         set_page_writeback(pp);
 3764         unlock_page(pp);
 3765
 3766         tx = dmu_tx_create(zfsvfs->z_os);
 3767         dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
 3768         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 3769         zfs_sa_upgrade_txholds(tx, zp);
 3770
              /*
               * Non-blocking assign with no retry in this function.  On
               * failure the page is marked dirty again, writeback is ended,
               * the async count is undone and err is returned; for ERESTART
               * the tx is waited on first.
               * NOTE(review): presumably a later writeback pass picks the
               * re-dirtied page up again -- confirm.
               */
 3771         err = dmu_tx_assign(tx, TXG_NOWAIT);
 3772         if (err != 0) {
 3773                 if (err == ERESTART)
 3774                         dmu_tx_wait(tx);
 3775
 3776                 dmu_tx_abort(tx);
 3777 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 3778                 filemap_dirty_folio(page_mapping(pp), page_folio(pp));
 3779 #else
 3780                 __set_page_dirty_nobuffers(pp);
 3781 #endif
 3782                 ClearPageError(pp);
 3783                 end_page_writeback(pp);
 3784                 if (!for_sync)
 3785                         atomic_dec_32(&zp->z_async_writes_cnt);
 3786                 zfs_rangelock_exit(lr);
 3787                 zfs_exit(zfsvfs, FTAG);
 3788                 return (err);
 3789         }
 3790
              /* Map the page and write pglen bytes of it at offset pgoff. */
 3791         va = kmap(pp);
 3792         ASSERT3U(pglen, <=, PAGE_SIZE);
 3793         dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
 3794         kunmap(pp);
 3795
 3796         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 3797         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 3798         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
 3799             &zp->z_pflags, 8);
 3800
 3801         /* Preserve the mtime and ctime provided by the inode */
 3802         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
 3803         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
 3804         zp->z_atime_dirty = B_FALSE;
 3805         zp->z_seq++;
 3806
              /*
               * NOTE(review): err from sa_bulk_update() is not checked here.
               * The write is still logged and the tx committed; err only
               * surfaces as this function's return value.
               */
 3807         err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 3808
              /*
               * Log the write.  for_sync selects the commit callback, and pp
               * is passed along as the callback argument.
               */
 3809         zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
 3810             for_sync ? zfs_putpage_sync_commit_cb :
 3811             zfs_putpage_async_commit_cb, pp);
 3812
 3813         dmu_tx_commit(tx);
 3814
 3815         zfs_rangelock_exit(lr);
 3816
 3817         if (wbc->sync_mode != WB_SYNC_NONE) {
 3818                 /*
 3819                  * Note that this is rarely called under writepages(), because
 3820                  * writepages() normally handles the entire commit for
 3821                  * performance reasons.
 3822                  */
 3823                 zil_commit(zfsvfs->z_log, zp->z_id);
 3824         } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
 3825                 /*
 3826                  * If the caller does not intend to wait synchronously
 3827                  * for this page writeback to complete and there are active
 3828                  * synchronous calls on this file, do a commit so that
 3829                  * the latter don't accidentally end up waiting for
 3830                  * our writeback to complete. Refer to the comment in
 3831                  * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
 3832                  */
 3833                 zil_commit(zfsvfs->z_log, zp->z_id);
 3834         }
 3835
 3836         dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
 3837
 3838         zfs_exit(zfsvfs, FTAG);
 3839         return (err);
 3840 }
 3841 
 3842 /*
 3843  * Update the system attributes when the inode has been dirtied.  For the
 3844  * moment we only update the mode, atime, mtime, and ctime.
 3845  */
 3846 int
 3847 zfs_dirty_inode(struct inode *ip, int flags)
 3848 {
 3849         znode_t         *zp = ITOZ(ip);
 3850         zfsvfs_t        *zfsvfs = ITOZSB(ip);
 3851         dmu_tx_t        *tx;
 3852         uint64_t        mode, atime[2], mtime[2], ctime[2];
 3853         sa_bulk_attr_t  bulk[4];
 3854         int             error = 0;
 3855         int             cnt = 0;
 3856 
 3857         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
 3858                 return (0);
 3859 
 3860         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 3861                 return (error);
 3862 
 3863 #ifdef I_DIRTY_TIME
 3864         /*
 3865          * This is the lazytime semantic introduced in Linux 4.0
 3866          * This flag will only be called from update_time when lazytime is set.
 3867          * (Note, I_DIRTY_SYNC will also set if not lazytime)
 3868          * Fortunately mtime and ctime are managed within ZFS itself, so we
 3869          * only need to dirty atime.
 3870          */
 3871         if (flags == I_DIRTY_TIME) {
 3872                 zp->z_atime_dirty = B_TRUE;
 3873                 goto out;
 3874         }
 3875 #endif
 3876 
 3877         tx = dmu_tx_create(zfsvfs->z_os);
 3878 
 3879         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 3880         zfs_sa_upgrade_txholds(tx, zp);
 3881 
 3882         error = dmu_tx_assign(tx, TXG_WAIT);
 3883         if (error) {
 3884                 dmu_tx_abort(tx);
 3885                 goto out;
 3886         }
 3887 
 3888         mutex_enter(&zp->z_lock);
 3889         zp->z_atime_dirty = B_FALSE;
 3890 
 3891         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
 3892         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
 3893         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 3894         SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 3895 
 3896         /* Preserve the mode, mtime and ctime provided by the inode */
 3897         ZFS_TIME_ENCODE(&ip->i_atime, atime);
 3898         ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
 3899         ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
 3900         mode = ip->i_mode;
 3901 
 3902         zp->z_mode = mode;
 3903 
 3904         error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 3905         mutex_exit(&zp->z_lock);
 3906 
 3907         dmu_tx_commit(tx);
 3908 out:
 3909         zfs_exit(zfsvfs, FTAG);
 3910         return (error);
 3911 }
 3912 
 3913 void
 3914 zfs_inactive(struct inode *ip)
 3915 {
 3916         znode_t *zp = ITOZ(ip);
 3917         zfsvfs_t *zfsvfs = ITOZSB(ip);
 3918         uint64_t atime[2];
 3919         int error;
 3920         int need_unlock = 0;
 3921 
 3922         /* Only read lock if we haven't already write locked, e.g. rollback */
 3923         if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
 3924                 need_unlock = 1;
 3925                 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 3926         }
 3927         if (zp->z_sa_hdl == NULL) {
 3928                 if (need_unlock)
 3929                         rw_exit(&zfsvfs->z_teardown_inactive_lock);
 3930                 return;
 3931         }
 3932 
 3933         if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
 3934                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 3935 
 3936                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 3937                 zfs_sa_upgrade_txholds(tx, zp);
 3938                 error = dmu_tx_assign(tx, TXG_WAIT);
 3939                 if (error) {
 3940                         dmu_tx_abort(tx);
 3941                 } else {
 3942                         ZFS_TIME_ENCODE(&ip->i_atime, atime);
 3943                         mutex_enter(&zp->z_lock);
 3944                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 3945                             (void *)&atime, sizeof (atime), tx);
 3946                         zp->z_atime_dirty = B_FALSE;
 3947                         mutex_exit(&zp->z_lock);
 3948                         dmu_tx_commit(tx);
 3949                 }
 3950         }
 3951 
 3952         zfs_zinactive(zp);
 3953         if (need_unlock)
 3954                 rw_exit(&zfsvfs->z_teardown_inactive_lock);
 3955 }
 3956 
 3957 /*
 3958  * Fill pages with data from the disk.
 3959  */
 3960 static int
 3961 zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
 3962 {
 3963         znode_t *zp = ITOZ(ip);
 3964         zfsvfs_t *zfsvfs = ITOZSB(ip);
 3965         objset_t *os;
 3966         struct page *cur_pp;
 3967         u_offset_t io_off, total;
 3968         size_t io_len;
 3969         loff_t i_size;
 3970         unsigned page_idx;
 3971         int err;
 3972 
 3973         os = zfsvfs->z_os;
 3974         io_len = nr_pages << PAGE_SHIFT;
 3975         i_size = i_size_read(ip);
 3976         io_off = page_offset(pl[0]);
 3977 
 3978         if (io_off + io_len > i_size)
 3979                 io_len = i_size - io_off;
 3980 
 3981         /*
 3982          * Iterate over list of pages and read each page individually.
 3983          */
 3984         page_idx = 0;
 3985         for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
 3986                 caddr_t va;
 3987 
 3988                 cur_pp = pl[page_idx++];
 3989                 va = kmap(cur_pp);
 3990                 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
 3991                     DMU_READ_PREFETCH);
 3992                 kunmap(cur_pp);
 3993                 if (err) {
 3994                         /* convert checksum errors into IO errors */
 3995                         if (err == ECKSUM)
 3996                                 err = SET_ERROR(EIO);
 3997                         return (err);
 3998                 }
 3999         }
 4000 
 4001         return (0);
 4002 }
 4003 
 4004 /*
 4005  * Uses zfs_fillpage to read data from the file and fill the pages.
 4006  *
 4007  *      IN:     ip       - inode of file to get data from.
 4008  *              pl       - list of pages to read
 4009  *              nr_pages - number of pages to read
 4010  *
 4011  *      RETURN: 0 on success, error code on failure.
 4012  *
 4013  * Timestamps:
 4014  *      vp - atime updated
 4015  */
 4016 int
 4017 zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
 4018 {
 4019         znode_t  *zp  = ITOZ(ip);
 4020         zfsvfs_t *zfsvfs = ITOZSB(ip);
 4021         int      err;
 4022 
 4023         if (pl == NULL)
 4024                 return (0);
 4025 
 4026         if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 4027                 return (err);
 4028 
 4029         err = zfs_fillpage(ip, pl, nr_pages);
 4030 
 4031         dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr_pages*PAGESIZE);
 4032 
 4033         zfs_exit(zfsvfs, FTAG);
 4034         return (err);
 4035 }
 4036 
 4037 /*
 4038  * Check ZFS specific permissions to memory map a section of a file.
 4039  *
 4040  *      IN:     ip      - inode of the file to mmap
 4041  *              off     - file offset
 4042  *              addrp   - start address in memory region
 4043  *              len     - length of memory region
 4044  *              vm_flags- address flags
 4045  *
 4046  *      RETURN: 0 if success
 4047  *              error code if failure
 4048  */
 4049 int
 4050 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
 4051     unsigned long vm_flags)
 4052 {
 4053         (void) addrp;
 4054         znode_t  *zp = ITOZ(ip);
 4055         zfsvfs_t *zfsvfs = ITOZSB(ip);
 4056         int error;
 4057 
 4058         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 4059                 return (error);
 4060 
 4061         if ((vm_flags & VM_WRITE) && (zp->z_pflags &
 4062             (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
 4063                 zfs_exit(zfsvfs, FTAG);
 4064                 return (SET_ERROR(EPERM));
 4065         }
 4066 
 4067         if ((vm_flags & (VM_READ | VM_EXEC)) &&
 4068             (zp->z_pflags & ZFS_AV_QUARANTINED)) {
 4069                 zfs_exit(zfsvfs, FTAG);
 4070                 return (SET_ERROR(EACCES));
 4071         }
 4072 
 4073         if (off < 0 || len > MAXOFFSET_T - off) {
 4074                 zfs_exit(zfsvfs, FTAG);
 4075                 return (SET_ERROR(ENXIO));
 4076         }
 4077 
 4078         zfs_exit(zfsvfs, FTAG);
 4079         return (0);
 4080 }
 4081 
 4082 /*
 4083  * Free or allocate space in a file.  Currently, this function only
 4084  * supports the `F_FREESP' command.  However, this command is somewhat
 4085  * misnamed, as its functionality includes the ability to allocate as
 4086  * well as free space.
 4087  *
 4088  *      IN:     zp      - znode of file to free data in.
 4089  *              cmd     - action to take (only F_FREESP supported).
 4090  *              bfp     - section of file to free/alloc.
 4091  *              flag    - current file open mode flags.
 4092  *              offset  - current file offset.
 4093  *              cr      - credentials of caller.
 4094  *
 4095  *      RETURN: 0 on success, error code on failure.
 4096  *
 4097  * Timestamps:
 4098  *      zp - ctime|mtime updated
 4099  */
 4100 int
 4101 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
 4102     offset_t offset, cred_t *cr)
 4103 {
 4104         (void) offset;
 4105         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
 4106         uint64_t        off, len;
 4107         int             error;
 4108 
 4109         if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 4110                 return (error);
 4111 
 4112         if (cmd != F_FREESP) {
 4113                 zfs_exit(zfsvfs, FTAG);
 4114                 return (SET_ERROR(EINVAL));
 4115         }
 4116 
 4117         /*
 4118          * Callers might not be able to detect properly that we are read-only,
 4119          * so check it explicitly here.
 4120          */
 4121         if (zfs_is_readonly(zfsvfs)) {
 4122                 zfs_exit(zfsvfs, FTAG);
 4123                 return (SET_ERROR(EROFS));
 4124         }
 4125 
 4126         if (bfp->l_len < 0) {
 4127                 zfs_exit(zfsvfs, FTAG);
 4128                 return (SET_ERROR(EINVAL));
 4129         }
 4130 
 4131         /*
 4132          * Permissions aren't checked on Solaris because on this OS
 4133          * zfs_space() can only be called with an opened file handle.
 4134          * On Linux we can get here through truncate_range() which
 4135          * operates directly on inodes, so we need to check access rights.
 4136          */
 4137         if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
 4138             kcred->user_ns))) {
 4139                 zfs_exit(zfsvfs, FTAG);
 4140                 return (error);
 4141         }
 4142 
 4143         off = bfp->l_start;
 4144         len = bfp->l_len; /* 0 means from off to end of file */
 4145 
 4146         error = zfs_freesp(zp, off, len, flag, TRUE);
 4147 
 4148         zfs_exit(zfsvfs, FTAG);
 4149         return (error);
 4150 }
 4151 
 4152 int
 4153 zfs_fid(struct inode *ip, fid_t *fidp)
 4154 {
 4155         znode_t         *zp = ITOZ(ip);
 4156         zfsvfs_t        *zfsvfs = ITOZSB(ip);
 4157         uint32_t        gen;
 4158         uint64_t        gen64;
 4159         uint64_t        object = zp->z_id;
 4160         zfid_short_t    *zfid;
 4161         int             size, i, error;
 4162 
 4163         if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 4164                 return (error);
 4165 
 4166         if (fidp->fid_len < SHORT_FID_LEN) {
 4167                 fidp->fid_len = SHORT_FID_LEN;
 4168                 zfs_exit(zfsvfs, FTAG);
 4169                 return (SET_ERROR(ENOSPC));
 4170         }
 4171 
 4172         if ((error = zfs_verify_zp(zp)) != 0) {
 4173                 zfs_exit(zfsvfs, FTAG);
 4174                 return (error);
 4175         }
 4176 
 4177         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 4178             &gen64, sizeof (uint64_t))) != 0) {
 4179                 zfs_exit(zfsvfs, FTAG);
 4180                 return (error);
 4181         }
 4182 
 4183         gen = (uint32_t)gen64;
 4184 
 4185         size = SHORT_FID_LEN;
 4186 
 4187         zfid = (zfid_short_t *)fidp;
 4188 
 4189         zfid->zf_len = size;
 4190 
 4191         for (i = 0; i < sizeof (zfid->zf_object); i++)
 4192                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 4193 
 4194         /* Must have a non-zero generation number to distinguish from .zfs */
 4195         if (gen == 0)
 4196                 gen = 1;
 4197         for (i = 0; i < sizeof (zfid->zf_gen); i++)
 4198                 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
 4199 
 4200         zfs_exit(zfsvfs, FTAG);
 4201         return (0);
 4202 }
 4203 
 4204 #if defined(_KERNEL)
      /*
       * Kernel-only section (compiled only when _KERNEL is defined).
       * Each entry point listed below is published with EXPORT_SYMBOL.
       * NOTE(review): presumably so that code outside this file's module
       * can link against them -- confirm which consumers rely on this.
       */
 4205 EXPORT_SYMBOL(zfs_open);
 4206 EXPORT_SYMBOL(zfs_close);
 4207 EXPORT_SYMBOL(zfs_lookup);
 4208 EXPORT_SYMBOL(zfs_create);
 4209 EXPORT_SYMBOL(zfs_tmpfile);
 4210 EXPORT_SYMBOL(zfs_remove);
 4211 EXPORT_SYMBOL(zfs_mkdir);
 4212 EXPORT_SYMBOL(zfs_rmdir);
 4213 EXPORT_SYMBOL(zfs_readdir);
 4214 EXPORT_SYMBOL(zfs_getattr_fast);
 4215 EXPORT_SYMBOL(zfs_setattr);
 4216 EXPORT_SYMBOL(zfs_rename);
 4217 EXPORT_SYMBOL(zfs_symlink);
 4218 EXPORT_SYMBOL(zfs_readlink);
 4219 EXPORT_SYMBOL(zfs_link);
 4220 EXPORT_SYMBOL(zfs_inactive);
 4221 EXPORT_SYMBOL(zfs_space);
 4222 EXPORT_SYMBOL(zfs_fid);
 4223 EXPORT_SYMBOL(zfs_getpage);
 4224 EXPORT_SYMBOL(zfs_putpage);
 4225 EXPORT_SYMBOL(zfs_dirty_inode);
 4226 EXPORT_SYMBOL(zfs_map);
 4227 
      /*
       * Register zfs_delete_blocks as a module parameter of type ulong with
       * permission bits 0644, and attach its description string.
       */
 4228 /* CSTYLED */
 4229 module_param(zfs_delete_blocks, ulong, 0644);
 4230 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
 4231 
 4232 #endif

Cache object: 3bcd62ce81e16da5a46421d65c70708f


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.