zfs_dir.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   24  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
   25  * Copyright 2017 Nexenta Systems, Inc.
   26  */
   27 
   28 #include <sys/types.h>
   29 #include <sys/param.h>
   30 #include <sys/time.h>
   31 #include <sys/sysmacros.h>
   32 #include <sys/vfs.h>
   33 #include <sys/vnode.h>
   34 #include <sys/file.h>
   35 #include <sys/kmem.h>
   36 #include <sys/uio.h>
   37 #include <sys/pathname.h>
   38 #include <sys/cmn_err.h>
   39 #include <sys/errno.h>
   40 #include <sys/stat.h>
   41 #include <sys/sunddi.h>
   42 #include <sys/random.h>
   43 #include <sys/policy.h>
   44 #include <sys/zfs_dir.h>
   45 #include <sys/zfs_acl.h>
   46 #include <sys/zfs_vnops.h>
   47 #include <sys/fs/zfs.h>
   48 #include <sys/zap.h>
   49 #include <sys/dmu.h>
   50 #include <sys/atomic.h>
   51 #include <sys/zfs_ctldir.h>
   52 #include <sys/zfs_fuid.h>
   53 #include <sys/sa.h>
   54 #include <sys/zfs_sa.h>
   55 #include <sys/dmu_objset.h>
   56 #include <sys/dsl_dir.h>
   57 
   58 /*
   59  * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
   60  * of names after deciding which is the appropriate lookup interface.
   61  */
   62 static int
   63 zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
   64     matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp,
   65     uint64_t *zoid)
   66 {
   67         boolean_t conflict = B_FALSE;
   68         int error;
   69 
   70         if (zfsvfs->z_norm) {
   71                 size_t bufsz = 0;
   72                 char *buf = NULL;
   73 
   74                 if (rpnp) {
   75                         buf = rpnp->pn_buf;
   76                         bufsz = rpnp->pn_bufsize;
   77                 }
   78 
   79                 /*
   80                  * In the non-mixed case we only expect there would ever
   81                  * be one match, but we need to use the normalizing lookup.
   82                  */
   83                 error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
   84                     zoid, mt, buf, bufsz, &conflict);
   85         } else {
   86                 error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
   87         }
   88 
   89         /*
   90          * Allow multiple entries provided the first entry is
   91          * the object id.  Non-zpl consumers may safely make
   92          * use of the additional space.
   93          *
   94          * XXX: This should be a feature flag for compatibility
   95          */
   96         if (error == EOVERFLOW)
   97                 error = 0;
   98 
   99         if (zfsvfs->z_norm && !error && deflags)
  100                 *deflags = conflict ? ED_CASE_CONFLICT : 0;
  101 
  102         *zoid = ZFS_DIRENT_OBJ(*zoid);
  103 
  104         return (error);
  105 }
  106 
  107 /*
  108  * Lock a directory entry.  A dirlock on <dzp, name> protects that name
  109  * in dzp's directory zap object.  As long as you hold a dirlock, you can
  110  * assume two things: (1) dzp cannot be reaped, and (2) no other thread
  111  * can change the zap entry for (i.e. link or unlink) this name.
  112  *
  113  * Input arguments:
  114  *      dzp     - znode for directory
  115  *      name    - name of entry to lock
  116  *      flag    - ZNEW: if the entry already exists, fail with EEXIST.
  117  *                ZEXISTS: if the entry does not exist, fail with ENOENT.
  118  *                ZSHARED: allow concurrent access with other ZSHARED callers.
  119  *                ZXATTR: we want dzp's xattr directory
  120  *                ZCILOOK: On a mixed sensitivity file system,
  121  *                         this lookup should be case-insensitive.
  122  *                ZCIEXACT: On a purely case-insensitive file system,
  123  *                          this lookup should be case-sensitive.
  124  *                ZRENAMING: we are locking for renaming, force narrow locks
  125  *                ZHAVELOCK: Don't grab the z_name_lock for this call. The
  126  *                           current thread already holds it.
  127  *
  128  * Output arguments:
  129  *      zpp     - pointer to the znode for the entry (NULL if there isn't one)
  130  *      dlpp    - pointer to the dirlock for this entry (NULL on error)
  131  *      direntflags - (case-insensitive lookup only)
  132  *              flags if multiple case-sensitive matches exist in directory
  133  *      realpnp     - (case-insensitive lookup only)
  134  *              actual name matched within the directory
  135  *
  136  * Return value: 0 on success or errno on failure.
  137  *
  138  * NOTE: Always checks for, and rejects, '.' and '..'.
  139  * NOTE: For case-insensitive file systems we take wide locks (see below),
  140  *       but return znode pointers to a single match.
  141  */
  142 int
  143 zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,
  144     znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)
  145 {
  146         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
  147         zfs_dirlock_t   *dl;
  148         boolean_t       update;
  149         matchtype_t     mt = 0;
  150         uint64_t        zoid;
  151         int             error = 0;
  152         int             cmpflags;
  153 
  154         *zpp = NULL;
  155         *dlpp = NULL;
  156 
  157         /*
  158          * Verify that we are not trying to lock '.', '..', or '.zfs'
  159          */
  160         if ((name[0] == '.' &&
  161             (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
  162             (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
  163                 return (SET_ERROR(EEXIST));
  164 
  165         /*
  166          * Case sensitivity and normalization preferences are set when
  167          * the file system is created.  These are stored in the
  168          * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
  169          * affect what vnodes can be cached in the DNLC, how we
  170          * perform zap lookups, and the "width" of our dirlocks.
  171          *
  172          * A normal dirlock locks a single name.  Note that with
  173          * normalization a name can be composed multiple ways, but
  174          * when normalized, these names all compare equal.  A wide
  175          * dirlock locks multiple names.  We need these when the file
  176          * system is supporting mixed-mode access.  It is sometimes
  177          * necessary to lock all case permutations of file name at
  178          * once so that simultaneous case-insensitive/case-sensitive
  179          * behaves as rationally as possible.
  180          */
  181 
  182         /*
  183          * When matching we may need to normalize & change case according to
  184          * FS settings.
  185          *
  186          * Note that a normalized match is necessary for a case insensitive
  187          * filesystem when the lookup request is not exact because normalization
  188          * can fold case independent of normalizing code point sequences.
  189          *
  190          * See the table above zfs_dropname().
  191          */
  192         if (zfsvfs->z_norm != 0) {
  193                 mt = MT_NORMALIZE;
  194 
  195                 /*
  196                  * Determine if the match needs to honor the case specified in
  197                  * lookup, and if so keep track of that so that during
  198                  * normalization we don't fold case.
  199                  */
  200                 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
  201                     (flag & ZCIEXACT)) ||
  202                     (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
  203                         mt |= MT_MATCH_CASE;
  204                 }
  205         }
  206 
  207         /*
  208          * Only look in or update the DNLC if we are looking for the
  209          * name on a file system that does not require normalization
  210          * or case folding.  We can also look there if we happen to be
  211          * on a non-normalizing, mixed sensitivity file system IF we
  212          * are looking for the exact name.
  213          *
  214          * Maybe can add TO-UPPERed version of name to dnlc in ci-only
  215          * case for performance improvement?
  216          */
  217         update = !zfsvfs->z_norm ||
  218             (zfsvfs->z_case == ZFS_CASE_MIXED &&
  219             !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
  220 
  221         /*
  222          * ZRENAMING indicates we are in a situation where we should
  223          * take narrow locks regardless of the file system's
  224          * preferences for normalizing and case folding.  This will
  225          * prevent us deadlocking trying to grab the same wide lock
  226          * twice if the two names happen to be case-insensitive
  227          * matches.
  228          */
  229         if (flag & ZRENAMING)
  230                 cmpflags = 0;
  231         else
  232                 cmpflags = zfsvfs->z_norm;
  233 
  234         /*
  235          * Wait until there are no locks on this name.
  236          *
  237          * Don't grab the lock if it is already held. However, cannot
  238          * have both ZSHARED and ZHAVELOCK together.
  239          */
  240         ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
  241         if (!(flag & ZHAVELOCK))
  242                 rw_enter(&dzp->z_name_lock, RW_READER);
  243 
  244         mutex_enter(&dzp->z_lock);
  245         for (;;) {
  246                 if (dzp->z_unlinked && !(flag & ZXATTR)) {
  247                         mutex_exit(&dzp->z_lock);
  248                         if (!(flag & ZHAVELOCK))
  249                                 rw_exit(&dzp->z_name_lock);
  250                         return (SET_ERROR(ENOENT));
  251                 }
  252                 for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
  253                         if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
  254                             U8_UNICODE_LATEST, &error) == 0) || error != 0)
  255                                 break;
  256                 }
  257                 if (error != 0) {
  258                         mutex_exit(&dzp->z_lock);
  259                         if (!(flag & ZHAVELOCK))
  260                                 rw_exit(&dzp->z_name_lock);
  261                         return (SET_ERROR(ENOENT));
  262                 }
  263                 if (dl == NULL) {
  264                         /*
  265                          * Allocate a new dirlock and add it to the list.
  266                          */
  267                         dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
  268                         cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
  269                         dl->dl_name = name;
  270                         dl->dl_sharecnt = 0;
  271                         dl->dl_namelock = 0;
  272                         dl->dl_namesize = 0;
  273                         dl->dl_dzp = dzp;
  274                         dl->dl_next = dzp->z_dirlocks;
  275                         dzp->z_dirlocks = dl;
  276                         break;
  277                 }
  278                 if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
  279                         break;
  280                 cv_wait(&dl->dl_cv, &dzp->z_lock);
  281         }
  282 
  283         /*
  284          * If the z_name_lock was NOT held for this dirlock record it.
  285          */
  286         if (flag & ZHAVELOCK)
  287                 dl->dl_namelock = 1;
  288 
  289         if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
  290                 /*
  291                  * We're the second shared reference to dl.  Make a copy of
  292                  * dl_name in case the first thread goes away before we do.
  293                  * Note that we initialize the new name before storing its
  294                  * pointer into dl_name, because the first thread may load
  295                  * dl->dl_name at any time.  It'll either see the old value,
  296                  * which belongs to it, or the new shared copy; either is OK.
  297                  */
  298                 dl->dl_namesize = strlen(dl->dl_name) + 1;
  299                 name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
  300                 memcpy(name, dl->dl_name, dl->dl_namesize);
  301                 dl->dl_name = name;
  302         }
  303 
  304         mutex_exit(&dzp->z_lock);
  305 
  306         /*
  307          * We have a dirlock on the name.  (Note that it is the dirlock,
  308          * not the dzp's z_lock, that protects the name in the zap object.)
  309          * See if there's an object by this name; if so, put a hold on it.
  310          */
  311         if (flag & ZXATTR) {
  312                 error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
  313                     sizeof (zoid));
  314                 if (error == 0)
  315                         error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
  316         } else {
  317                 error = zfs_match_find(zfsvfs, dzp, name, mt,
  318                     update, direntflags, realpnp, &zoid);
  319         }
  320         if (error) {
  321                 if (error != ENOENT || (flag & ZEXISTS)) {
  322                         zfs_dirent_unlock(dl);
  323                         return (error);
  324                 }
  325         } else {
  326                 if (flag & ZNEW) {
  327                         zfs_dirent_unlock(dl);
  328                         return (SET_ERROR(EEXIST));
  329                 }
  330                 error = zfs_zget(zfsvfs, zoid, zpp);
  331                 if (error) {
  332                         zfs_dirent_unlock(dl);
  333                         return (error);
  334                 }
  335         }
  336 
  337         *dlpp = dl;
  338 
  339         return (0);
  340 }
  341 
  342 /*
  343  * Unlock this directory entry and wake anyone who was waiting for it.
  344  */
  345 void
  346 zfs_dirent_unlock(zfs_dirlock_t *dl)
  347 {
  348         znode_t *dzp = dl->dl_dzp;
  349         zfs_dirlock_t **prev_dl, *cur_dl;
  350 
  351         mutex_enter(&dzp->z_lock);
  352 
  353         if (!dl->dl_namelock)
  354                 rw_exit(&dzp->z_name_lock);
  355 
  356         if (dl->dl_sharecnt > 1) {
  357                 dl->dl_sharecnt--;
  358                 mutex_exit(&dzp->z_lock);
  359                 return;
  360         }
  361         prev_dl = &dzp->z_dirlocks;
  362         while ((cur_dl = *prev_dl) != dl)
  363                 prev_dl = &cur_dl->dl_next;
  364         *prev_dl = dl->dl_next;
  365         cv_broadcast(&dl->dl_cv);
  366         mutex_exit(&dzp->z_lock);
  367 
  368         if (dl->dl_namesize != 0)
  369                 kmem_free(dl->dl_name, dl->dl_namesize);
  370         cv_destroy(&dl->dl_cv);
  371         kmem_free(dl, sizeof (*dl));
  372 }
  373 
  374 /*
  375  * Look up an entry in a directory.
  376  *
  377  * NOTE: '.' and '..' are handled as special cases because
  378  *      no directory entries are actually stored for them.  If this is
  379  *      the root of a filesystem, then '.zfs' is also treated as a
  380  *      special pseudo-directory.
  381  */
  382 int
  383 zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,
  384     int *deflg, pathname_t *rpnp)
  385 {
  386         zfs_dirlock_t *dl;
  387         znode_t *zp;
  388         struct inode *ip;
  389         int error = 0;
  390         uint64_t parent;
  391 
  392         if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
  393                 *zpp = dzp;
  394                 zhold(*zpp);
  395         } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
  396                 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
  397 
  398                 /*
  399                  * If we are a snapshot mounted under .zfs, return
  400                  * the inode pointer for the snapshot directory.
  401                  */
  402                 if ((error = sa_lookup(dzp->z_sa_hdl,
  403                     SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
  404                         return (error);
  405 
  406                 if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
  407                         error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
  408                             "snapshot", &ip, 0, kcred, NULL, NULL);
  409                         *zpp = ITOZ(ip);
  410                         return (error);
  411                 }
  412                 rw_enter(&dzp->z_parent_lock, RW_READER);
  413                 error = zfs_zget(zfsvfs, parent, &zp);
  414                 if (error == 0)
  415                         *zpp = zp;
  416                 rw_exit(&dzp->z_parent_lock);
  417         } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
  418                 ip = zfsctl_root(dzp);
  419                 *zpp = ITOZ(ip);
  420         } else {
  421                 int zf;
  422 
  423                 zf = ZEXISTS | ZSHARED;
  424                 if (flags & FIGNORECASE)
  425                         zf |= ZCILOOK;
  426 
  427                 error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
  428                 if (error == 0) {
  429                         *zpp = zp;
  430                         zfs_dirent_unlock(dl);
  431                         dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
  432                 }
  433                 rpnp = NULL;
  434         }
  435 
  436         if ((flags & FIGNORECASE) && rpnp && !error)
  437                 (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
  438 
  439         return (error);
  440 }
  441 
  442 /*
  443  * unlinked Set (formerly known as the "delete queue") Error Handling
  444  *
  445  * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
  446  * don't specify the name of the entry that we will be manipulating.  We
  447  * also fib and say that we won't be adding any new entries to the
  448  * unlinked set, even though we might (this is to lower the minimum file
  449  * size that can be deleted in a full filesystem).  So on the small
  450  * chance that the nlink list is using a fat zap (ie. has more than
  451  * 2000 entries), we *may* not pre-read a block that's needed.
  452  * Therefore it is remotely possible for some of the assertions
  453  * regarding the unlinked set below to fail due to i/o error.  On a
  454  * nondebug system, this will result in the space being leaked.
  455  */
  456 void
  457 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
  458 {
  459         zfsvfs_t *zfsvfs = ZTOZSB(zp);
  460 
  461         ASSERT(zp->z_unlinked);
  462         ASSERT(ZTOI(zp)->i_nlink == 0);
  463 
  464         VERIFY3U(0, ==,
  465             zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
  466 
  467         dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
  468 }
  469 
  470 /*
  471  * Clean up any znodes that had no links when we either crashed or
  472  * (force) umounted the file system.
  473  */
  474 static void
  475 zfs_unlinked_drain_task(void *arg)
  476 {
  477         zfsvfs_t *zfsvfs = arg;
  478         zap_cursor_t    zc;
  479         zap_attribute_t zap;
  480         dmu_object_info_t doi;
  481         znode_t         *zp;
  482         int             error;
  483 
  484         ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
  485 
  486         /*
  487          * Iterate over the contents of the unlinked set.
  488          */
  489         for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
  490             zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
  491             zap_cursor_advance(&zc)) {
  492 
  493                 /*
  494                  * See what kind of object we have in list
  495                  */
  496 
  497                 error = dmu_object_info(zfsvfs->z_os,
  498                     zap.za_first_integer, &doi);
  499                 if (error != 0)
  500                         continue;
  501 
  502                 ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
  503                     (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
  504                 /*
  505                  * We need to re-mark these list entries for deletion,
  506                  * so we pull them back into core and set zp->z_unlinked.
  507                  */
  508                 error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
  509 
  510                 /*
  511                  * We may pick up znodes that are already marked for deletion.
  512                  * This could happen during the purge of an extended attribute
  513                  * directory.  All we need to do is skip over them, since they
  514                  * are already in the system marked z_unlinked.
  515                  */
  516                 if (error != 0)
  517                         continue;
  518 
  519                 zp->z_unlinked = B_TRUE;
  520 
  521                 /*
  522                  * zrele() decrements the znode's ref count and may cause
  523                  * it to be synchronously freed. We interrupt freeing
  524                  * of this znode by checking the return value of
  525                  * dmu_objset_zfs_unmounting() in dmu_free_long_range()
  526                  * when an unmount is requested.
  527                  */
  528                 zrele(zp);
  529                 ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
  530         }
  531         zap_cursor_fini(&zc);
  532 
  533         zfsvfs->z_draining = B_FALSE;
  534         zfsvfs->z_drain_task = TASKQID_INVALID;
  535 }
  536 
  537 /*
  538  * Sets z_draining then tries to dispatch async unlinked drain.
  539  * If that fails executes synchronous unlinked drain.
  540  */
  541 void
  542 zfs_unlinked_drain(zfsvfs_t *zfsvfs)
  543 {
  544         ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
  545         ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
  546 
  547         zfsvfs->z_draining = B_TRUE;
  548         zfsvfs->z_drain_cancel = B_FALSE;
  549 
  550         zfsvfs->z_drain_task = taskq_dispatch(
  551             dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
  552             zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
  553         if (zfsvfs->z_drain_task == TASKQID_INVALID) {
  554                 zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
  555                 zfs_unlinked_drain_task(zfsvfs);
  556         }
  557 }
  558 
  559 /*
  560  * Wait for the unlinked drain taskq task to stop. This will interrupt the
  561  * unlinked set processing if it is in progress.
  562  */
  563 void
  564 zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
  565 {
  566         ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
  567 
  568         if (zfsvfs->z_draining) {
  569                 zfsvfs->z_drain_cancel = B_TRUE;
  570                 taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
  571                     dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
  572                 zfsvfs->z_drain_task = TASKQID_INVALID;
  573                 zfsvfs->z_draining = B_FALSE;
  574         }
  575 }
  576 
  577 /*
  578  * Delete the entire contents of a directory.  Return a count
  579  * of the number of entries that could not be deleted. If we encounter
  580  * an error, return a count of at least one so that the directory stays
  581  * in the unlinked set.
  582  *
  583  * NOTE: this function assumes that the directory is inactive,
  584  *      so there is no need to lock its entries before deletion.
  585  *      Also, it assumes the directory contents is *only* regular
  586  *      files.
  587  */
  588 static int
  589 zfs_purgedir(znode_t *dzp)
  590 {
  591         zap_cursor_t    zc;
  592         zap_attribute_t zap;
  593         znode_t         *xzp;
  594         dmu_tx_t        *tx;
  595         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
  596         zfs_dirlock_t   dl;
  597         int skipped = 0;
  598         int error;
  599 
  600         for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
  601             (error = zap_cursor_retrieve(&zc, &zap)) == 0;
  602             zap_cursor_advance(&zc)) {
  603                 error = zfs_zget(zfsvfs,
  604                     ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
  605                 if (error) {
  606                         skipped += 1;
  607                         continue;
  608                 }
  609 
  610                 ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
  611                     S_ISLNK(ZTOI(xzp)->i_mode));
  612 
  613                 tx = dmu_tx_create(zfsvfs->z_os);
  614                 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
  615                 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
  616                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
  617                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
  618                 /* Is this really needed ? */
  619                 zfs_sa_upgrade_txholds(tx, xzp);
  620                 dmu_tx_mark_netfree(tx);
  621                 error = dmu_tx_assign(tx, TXG_WAIT);
  622                 if (error) {
  623                         dmu_tx_abort(tx);
  624                         zfs_zrele_async(xzp);
  625                         skipped += 1;
  626                         continue;
  627                 }
  628                 memset(&dl, 0, sizeof (dl));
  629                 dl.dl_dzp = dzp;
  630                 dl.dl_name = zap.za_name;
  631 
  632                 error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
  633                 if (error)
  634                         skipped += 1;
  635                 dmu_tx_commit(tx);
  636 
  637                 zfs_zrele_async(xzp);
  638         }
  639         zap_cursor_fini(&zc);
  640         if (error != ENOENT)
  641                 skipped += 1;
  642         return (skipped);
  643 }
  644 
  645 void
  646 zfs_rmnode(znode_t *zp)
  647 {
  648         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
  649         objset_t        *os = zfsvfs->z_os;
  650         znode_t         *xzp = NULL;
  651         dmu_tx_t        *tx;
  652         znode_hold_t    *zh;
  653         uint64_t        z_id = zp->z_id;
  654         uint64_t        acl_obj;
  655         uint64_t        xattr_obj;
  656         uint64_t        links;
  657         int             error;
  658 
  659         ASSERT(ZTOI(zp)->i_nlink == 0);
  660         ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
  661 
  662         /*
  663          * If this is an attribute directory, purge its contents.
  664          */
  665         if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
  666                 if (zfs_purgedir(zp) != 0) {
  667                         /*
  668                          * Not enough space to delete some xattrs.
  669                          * Leave it in the unlinked set.
  670                          */
  671                         zh = zfs_znode_hold_enter(zfsvfs, z_id);
  672                         zfs_znode_dmu_fini(zp);
  673                         zfs_znode_hold_exit(zfsvfs, zh);
  674                         return;
  675                 }
  676         }
  677 
  678         /*
  679          * Free up all the data in the file.  We don't do this for directories
  680          * because we need truncate and remove to be in the same tx, like in
  681          * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
  682          * an inconsistent truncated zap object in the delete queue.  Note a
  683          * truncated file is harmless since it only contains user data.
  684          */
  685         if (S_ISREG(ZTOI(zp)->i_mode)) {
  686                 error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
  687                 if (error) {
  688                         /*
  689                          * Not enough space or we were interrupted by unmount.
  690                          * Leave the file in the unlinked set.
  691                          */
  692                         zh = zfs_znode_hold_enter(zfsvfs, z_id);
  693                         zfs_znode_dmu_fini(zp);
  694                         zfs_znode_hold_exit(zfsvfs, zh);
  695                         return;
  696                 }
  697         }
  698 
  699         /*
  700          * If the file has extended attributes, we're going to unlink
  701          * the xattr dir.
  702          */
  703         error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
  704             &xattr_obj, sizeof (xattr_obj));
  705         if (error == 0 && xattr_obj) {
  706                 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
  707                 ASSERT(error == 0);
  708         }
  709 
  710         acl_obj = zfs_external_acl(zp);
  711 
  712         /*
  713          * Set up the final transaction.
  714          */
  715         tx = dmu_tx_create(os);
  716         dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
  717         dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
  718         if (xzp) {
  719                 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
  720                 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
  721         }
  722         if (acl_obj)
  723                 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
  724 
  725         zfs_sa_upgrade_txholds(tx, zp);
  726         error = dmu_tx_assign(tx, TXG_WAIT);
  727         if (error) {
  728                 /*
  729                  * Not enough space to delete the file.  Leave it in the
  730                  * unlinked set, leaking it until the fs is remounted (at
  731                  * which point we'll call zfs_unlinked_drain() to process it).
  732                  */
  733                 dmu_tx_abort(tx);
  734                 zh = zfs_znode_hold_enter(zfsvfs, z_id);
  735                 zfs_znode_dmu_fini(zp);
  736                 zfs_znode_hold_exit(zfsvfs, zh);
  737                 goto out;
  738         }
  739 
  740         if (xzp) {
  741                 ASSERT(error == 0);
  742                 mutex_enter(&xzp->z_lock);
  743                 xzp->z_unlinked = B_TRUE;       /* mark xzp for deletion */
  744                 clear_nlink(ZTOI(xzp));         /* no more links to it */
  745                 links = 0;
  746                 VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
  747                     &links, sizeof (links), tx));
  748                 mutex_exit(&xzp->z_lock);
  749                 zfs_unlinked_add(xzp, tx);
  750         }
  751 
  752         mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
  753 
  754         /*
  755          * Remove this znode from the unlinked set.  If a rollback has
  756          * occurred while a file is open and unlinked, then when the file
  757          * is closed post rollback it will not exist in the rolled back
  758          * version of the unlinked object.
  759          */
  760         error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
  761             zp->z_id, tx);
  762         VERIFY(error == 0 || error == ENOENT);
  763 
  764         uint64_t count;
  765         if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
  766                 cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
  767         }
  768 
  769         mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
  770 
  771         dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
  772 
  773         zfs_znode_delete(zp, tx);
  774 
  775         dmu_tx_commit(tx);
  776 out:
  777         if (xzp)
  778                 zfs_zrele_async(xzp);
  779 }
  780 
  781 static uint64_t
  782 zfs_dirent(znode_t *zp, uint64_t mode)
  783 {
  784         uint64_t de = zp->z_id;
  785 
  786         if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
  787                 de |= IFTODT(mode) << 60;
  788         return (de);
  789 }
  790 
  791 /*
  792  * Link zp into dl.  Can fail in the following cases :
  793  * - if zp has been unlinked.
  794  * - if the number of entries with the same hash (aka. colliding entries)
  795  *    exceed the capacity of a leaf-block of fatzap and splitting of the
  796  *    leaf-block does not help.
  797  */
  798 int
  799 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
  800 {
  801         znode_t *dzp = dl->dl_dzp;
  802         zfsvfs_t *zfsvfs = ZTOZSB(zp);
  803         uint64_t value;
  804         int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
  805         sa_bulk_attr_t bulk[5];
  806         uint64_t mtime[2], ctime[2];
  807         uint64_t links;
  808         int count = 0;
  809         int error;
  810 
  811         mutex_enter(&zp->z_lock);
  812 
  813         if (!(flag & ZRENAMING)) {
  814                 if (zp->z_unlinked) {   /* no new links to unlinked zp */
  815                         ASSERT(!(flag & (ZNEW | ZEXISTS)));
  816                         mutex_exit(&zp->z_lock);
  817                         return (SET_ERROR(ENOENT));
  818                 }
  819                 if (!(flag & ZNEW)) {
  820                         /*
  821                          * ZNEW nodes come from zfs_mknode() where the link
  822                          * count has already been initialised
  823                          */
  824                         inc_nlink(ZTOI(zp));
  825                         links = ZTOI(zp)->i_nlink;
  826                         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
  827                             NULL, &links, sizeof (links));
  828                 }
  829         }
  830 
  831         value = zfs_dirent(zp, zp->z_mode);
  832         error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
  833             &value, tx);
  834 
  835         /*
  836          * zap_add could fail to add the entry if it exceeds the capacity of the
  837          * leaf-block and zap_leaf_split() failed to help.
  838          * The caller of this routine is responsible for failing the transaction
  839          * which will rollback the SA updates done above.
  840          */
  841         if (error != 0) {
  842                 if (!(flag & ZRENAMING) && !(flag & ZNEW))
  843                         drop_nlink(ZTOI(zp));
  844                 mutex_exit(&zp->z_lock);
  845                 return (error);
  846         }
  847 
  848         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
  849             &dzp->z_id, sizeof (dzp->z_id));
  850         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
  851             &zp->z_pflags, sizeof (zp->z_pflags));
  852 
  853         if (!(flag & ZNEW)) {
  854                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
  855                     ctime, sizeof (ctime));
  856                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
  857                     ctime);
  858         }
  859         error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
  860         ASSERT(error == 0);
  861 
  862         mutex_exit(&zp->z_lock);
  863 
  864         mutex_enter(&dzp->z_lock);
  865         dzp->z_size++;
  866         if (zp_is_dir)
  867                 inc_nlink(ZTOI(dzp));
  868         links = ZTOI(dzp)->i_nlink;
  869         count = 0;
  870         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
  871             &dzp->z_size, sizeof (dzp->z_size));
  872         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
  873             &links, sizeof (links));
  874         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
  875             mtime, sizeof (mtime));
  876         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
  877             ctime, sizeof (ctime));
  878         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
  879             &dzp->z_pflags, sizeof (dzp->z_pflags));
  880         zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
  881         error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
  882         ASSERT(error == 0);
  883         mutex_exit(&dzp->z_lock);
  884 
  885         return (0);
  886 }
  887 
  888 /*
  889  * The match type in the code for this function should conform to:
  890  *
  891  * ------------------------------------------------------------------------
  892  * fs type  | z_norm      | lookup type | match type
  893  * ---------|-------------|-------------|----------------------------------
  894  * CS !norm | 0           |           0 | 0 (exact)
  895  * CS  norm | formX       |           0 | MT_NORMALIZE
  896  * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
  897  * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
  898  * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
  899  * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
  900  * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
  901  * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
  902  * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
  903  * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
  904  *
  905  * Abbreviations:
  906  *    CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
  907  *    upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
  908  *    formX = unicode normalization form set on fs creation
  909  */
  910 static int
  911 zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
  912     int flag)
  913 {
  914         int error;
  915 
  916         if (ZTOZSB(zp)->z_norm) {
  917                 matchtype_t mt = MT_NORMALIZE;
  918 
  919                 if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
  920                     (flag & ZCIEXACT)) ||
  921                     (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
  922                     !(flag & ZCILOOK))) {
  923                         mt |= MT_MATCH_CASE;
  924                 }
  925 
  926                 error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
  927                     dl->dl_name, mt, tx);
  928         } else {
  929                 error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
  930                     tx);
  931         }
  932 
  933         return (error);
  934 }
  935 
  936 static int
  937 zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
  938 {
  939         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
  940         int             zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
  941         boolean_t       unlinked = B_FALSE;
  942         sa_bulk_attr_t  bulk[3];
  943         uint64_t        mtime[2], ctime[2];
  944         uint64_t        links;
  945         int             count = 0;
  946         int             error;
  947 
  948         if (zp_is_dir && !zfs_dirempty(zp))
  949                 return (SET_ERROR(ENOTEMPTY));
  950 
  951         if (ZTOI(zp)->i_nlink <= zp_is_dir) {
  952                 zfs_panic_recover("zfs: link count on %lu is %u, "
  953                     "should be at least %u", zp->z_id,
  954                     (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
  955                 set_nlink(ZTOI(zp), zp_is_dir + 1);
  956         }
  957         drop_nlink(ZTOI(zp));
  958         if (ZTOI(zp)->i_nlink == zp_is_dir) {
  959                 zp->z_unlinked = B_TRUE;
  960                 clear_nlink(ZTOI(zp));
  961                 unlinked = B_TRUE;
  962         } else {
  963                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
  964                     NULL, &ctime, sizeof (ctime));
  965                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
  966                     NULL, &zp->z_pflags, sizeof (zp->z_pflags));
  967                 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
  968                     ctime);
  969         }
  970         links = ZTOI(zp)->i_nlink;
  971         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
  972             NULL, &links, sizeof (links));
  973         error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
  974         ASSERT3U(error, ==, 0);
  975 
  976         if (unlinkedp != NULL)
  977                 *unlinkedp = unlinked;
  978         else if (unlinked)
  979                 zfs_unlinked_add(zp, tx);
  980 
  981         return (0);
  982 }
  983 
  984 /*
  985  * Forcefully drop an nlink reference from (zp) and mark it for deletion if it
  986  * was the last link. This *must* only be done to znodes which have already
  987  * been zfs_link_destroy()'d with ZRENAMING. This is explicitly only used in
  988  * the error path of zfs_rename(), where we have to correct the nlink count if
  989  * we failed to link the target as well as failing to re-link the original
  990  * znodes.
  991  */
  992 int
  993 zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
  994 {
  995         int error;
  996 
  997         mutex_enter(&zp->z_lock);
  998         error = zfs_drop_nlink_locked(zp, tx, unlinkedp);
  999         mutex_exit(&zp->z_lock);
 1000 
 1001         return (error);
 1002 }
 1003 
 1004 /*
 1005  * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
 1006  * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
 1007  * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
 1008  * If it's non-NULL, we use it to indicate whether the znode needs deletion,
 1009  * and it's the caller's job to do it.
 1010  */
 1011 int
 1012 zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
 1013     boolean_t *unlinkedp)
 1014 {
 1015         znode_t *dzp = dl->dl_dzp;
 1016         zfsvfs_t *zfsvfs = ZTOZSB(dzp);
 1017         int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
 1018         boolean_t unlinked = B_FALSE;
 1019         sa_bulk_attr_t bulk[5];
 1020         uint64_t mtime[2], ctime[2];
 1021         uint64_t links;
 1022         int count = 0;
 1023         int error;
 1024 
 1025         if (!(flag & ZRENAMING)) {
 1026                 mutex_enter(&zp->z_lock);
 1027 
 1028                 if (zp_is_dir && !zfs_dirempty(zp)) {
 1029                         mutex_exit(&zp->z_lock);
 1030                         return (SET_ERROR(ENOTEMPTY));
 1031                 }
 1032 
 1033                 /*
 1034                  * If we get here, we are going to try to remove the object.
 1035                  * First try removing the name from the directory; if that
 1036                  * fails, return the error.
 1037                  */
 1038                 error = zfs_dropname(dl, zp, dzp, tx, flag);
 1039                 if (error != 0) {
 1040                         mutex_exit(&zp->z_lock);
 1041                         return (error);
 1042                 }
 1043 
 1044                 /* The only error is !zfs_dirempty() and we checked earlier. */
 1045                 error = zfs_drop_nlink_locked(zp, tx, &unlinked);
 1046                 ASSERT3U(error, ==, 0);
 1047                 mutex_exit(&zp->z_lock);
 1048         } else {
 1049                 error = zfs_dropname(dl, zp, dzp, tx, flag);
 1050                 if (error != 0)
 1051                         return (error);
 1052         }
 1053 
 1054         mutex_enter(&dzp->z_lock);
 1055         dzp->z_size--;          /* one dirent removed */
 1056         if (zp_is_dir)
 1057                 drop_nlink(ZTOI(dzp));  /* ".." link from zp */
 1058         links = ZTOI(dzp)->i_nlink;
 1059         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
 1060             NULL, &links, sizeof (links));
 1061         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
 1062             NULL, &dzp->z_size, sizeof (dzp->z_size));
 1063         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
 1064             NULL, ctime, sizeof (ctime));
 1065         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
 1066             NULL, mtime, sizeof (mtime));
 1067         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 1068             NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
 1069         zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
 1070         error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
 1071         ASSERT(error == 0);
 1072         mutex_exit(&dzp->z_lock);
 1073 
 1074         if (unlinkedp != NULL)
 1075                 *unlinkedp = unlinked;
 1076         else if (unlinked)
 1077                 zfs_unlinked_add(zp, tx);
 1078 
 1079         return (0);
 1080 }
 1081 
 1082 /*
 1083  * Indicate whether the directory is empty.  Works with or without z_lock
 1084  * held, but can only be consider a hint in the latter case.  Returns true
 1085  * if only "." and ".." remain and there's no work in progress.
 1086  *
 1087  * The internal ZAP size, rather than zp->z_size, needs to be checked since
 1088  * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
 1089  */
 1090 boolean_t
 1091 zfs_dirempty(znode_t *dzp)
 1092 {
 1093         zfsvfs_t *zfsvfs = ZTOZSB(dzp);
 1094         uint64_t count;
 1095         int error;
 1096 
 1097         if (dzp->z_dirlocks != NULL)
 1098                 return (B_FALSE);
 1099 
 1100         error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
 1101         if (error != 0 || count != 0)
 1102                 return (B_FALSE);
 1103 
 1104         return (B_TRUE);
 1105 }
 1106 
 1107 int
 1108 zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
 1109 {
 1110         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 1111         znode_t *xzp;
 1112         dmu_tx_t *tx;
 1113         int error;
 1114         zfs_acl_ids_t acl_ids;
 1115         boolean_t fuid_dirtied;
 1116 #ifdef ZFS_DEBUG
 1117         uint64_t parent;
 1118 #endif
 1119 
 1120         *xzpp = NULL;
 1121 
 1122         if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
 1123             &acl_ids, kcred->user_ns)) != 0)
 1124                 return (error);
 1125         if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
 1126                 zfs_acl_ids_free(&acl_ids);
 1127                 return (SET_ERROR(EDQUOT));
 1128         }
 1129 
 1130         tx = dmu_tx_create(zfsvfs->z_os);
 1131         dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 1132             ZFS_SA_BASE_ATTR_SIZE);
 1133         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 1134         dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 1135         fuid_dirtied = zfsvfs->z_fuid_dirty;
 1136         if (fuid_dirtied)
 1137                 zfs_fuid_txhold(zfsvfs, tx);
 1138         error = dmu_tx_assign(tx, TXG_WAIT);
 1139         if (error) {
 1140                 zfs_acl_ids_free(&acl_ids);
 1141                 dmu_tx_abort(tx);
 1142                 return (error);
 1143         }
 1144         zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
 1145 
 1146         if (fuid_dirtied)
 1147                 zfs_fuid_sync(zfsvfs, tx);
 1148 
 1149 #ifdef ZFS_DEBUG
 1150         error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 1151             &parent, sizeof (parent));
 1152         ASSERT(error == 0 && parent == zp->z_id);
 1153 #endif
 1154 
 1155         VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
 1156             sizeof (xzp->z_id), tx));
 1157 
 1158         if (!zp->z_unlinked)
 1159                 zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL,
 1160                     acl_ids.z_fuidp, vap);
 1161 
 1162         zfs_acl_ids_free(&acl_ids);
 1163         dmu_tx_commit(tx);
 1164 
 1165         *xzpp = xzp;
 1166 
 1167         return (0);
 1168 }
 1169 
 1170 /*
 1171  * Return a znode for the extended attribute directory for zp.
 1172  * ** If the directory does not already exist, it is created **
 1173  *
 1174  *      IN:     zp      - znode to obtain attribute directory from
 1175  *              cr      - credentials of caller
 1176  *              flags   - flags from the VOP_LOOKUP call
 1177  *
 1178  *      OUT:    xipp    - pointer to extended attribute znode
 1179  *
 1180  *      RETURN: 0 on success
 1181  *              error number on failure
 1182  */
 1183 int
 1184 zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
 1185 {
 1186         zfsvfs_t        *zfsvfs = ZTOZSB(zp);
 1187         znode_t         *xzp;
 1188         zfs_dirlock_t   *dl;
 1189         vattr_t         va;
 1190         int             error;
 1191 top:
 1192         error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
 1193         if (error)
 1194                 return (error);
 1195 
 1196         if (xzp != NULL) {
 1197                 *xzpp = xzp;
 1198                 zfs_dirent_unlock(dl);
 1199                 return (0);
 1200         }
 1201 
 1202         if (!(flags & CREATE_XATTR_DIR)) {
 1203                 zfs_dirent_unlock(dl);
 1204                 return (SET_ERROR(ENOENT));
 1205         }
 1206 
 1207         if (zfs_is_readonly(zfsvfs)) {
 1208                 zfs_dirent_unlock(dl);
 1209                 return (SET_ERROR(EROFS));
 1210         }
 1211 
 1212         /*
 1213          * The ability to 'create' files in an attribute
 1214          * directory comes from the write_xattr permission on the base file.
 1215          *
 1216          * The ability to 'search' an attribute directory requires
 1217          * read_xattr permission on the base file.
 1218          *
 1219          * Once in a directory the ability to read/write attributes
 1220          * is controlled by the permissions on the attribute file.
 1221          */
 1222         va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
 1223         va.va_mode = S_IFDIR | S_ISVTX | 0777;
 1224         zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
 1225 
 1226         va.va_dentry = NULL;
 1227         error = zfs_make_xattrdir(zp, &va, xzpp, cr);
 1228         zfs_dirent_unlock(dl);
 1229 
 1230         if (error == ERESTART) {
 1231                 /* NB: we already did dmu_tx_wait() if necessary */
 1232                 goto top;
 1233         }
 1234 
 1235         return (error);
 1236 }
 1237 
 1238 /*
 1239  * Decide whether it is okay to remove within a sticky directory.
 1240  *
 1241  * In sticky directories, write access is not sufficient;
 1242  * you can remove entries from a directory only if:
 1243  *
 1244  *      you own the directory,
 1245  *      you own the entry,
 1246  *      you have write access to the entry,
 1247  *      or you are privileged (checked in secpolicy...).
 1248  *
 1249  * The function returns 0 if remove access is granted.
 1250  */
 1251 int
 1252 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
 1253 {
 1254         uid_t           uid;
 1255         uid_t           downer;
 1256         uid_t           fowner;
 1257         zfsvfs_t        *zfsvfs = ZTOZSB(zdp);
 1258 
 1259         if (zfsvfs->z_replay)
 1260                 return (0);
 1261 
 1262         if ((zdp->z_mode & S_ISVTX) == 0)
 1263                 return (0);
 1264 
 1265         downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
 1266             cr, ZFS_OWNER);
 1267         fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
 1268             cr, ZFS_OWNER);
 1269 
 1270         if ((uid = crgetuid(cr)) == downer || uid == fowner ||
 1271             zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
 1272             kcred->user_ns) == 0)
 1273                 return (0);
 1274         else
 1275                 return (secpolicy_vnode_remove(cr));
 1276 }
Cache object: fe6dadbebd667bb7b7cdcf85906cb61b
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c