The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/contrib/openzfs/module/zfs/zap.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or https://opensource.org/licenses/CDDL-1.0.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 /*
   22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
   24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
   25  */
   26 
   27 /*
   28  * This file contains the top half of the zfs directory structure
   29  * implementation. The bottom half is in zap_leaf.c.
   30  *
   31  * The zdir is an extendable hash data structure. There is a table of
   32  * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
   33  * each a constant size and hold a variable number of directory entries.
   34  * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
   35  *
   36  * The pointer table holds a power of 2 number of pointers.
   37  * (1<<zap_t->zd_data->zd_phys->zd_prefix_len).  The bucket pointed to
   38  * by the pointer at index i in the table holds entries whose hash value
   39  * has a zd_prefix_len-bit prefix equal to i.
   40  */
   41 
   42 #include <sys/spa.h>
   43 #include <sys/dmu.h>
   44 #include <sys/zfs_context.h>
   45 #include <sys/zfs_znode.h>
   46 #include <sys/fs/zfs.h>
   47 #include <sys/zap.h>
   48 #include <sys/zap_impl.h>
   49 #include <sys/zap_leaf.h>
   50 
   51 /*
   52  * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
   53  * (all leaf blocks) when we start iterating over it.
   54  *
   55  * For zap_cursor_init(), the callers all intend to iterate through all the
   56  * entries.  There are a few cases where an error (typically i/o error) could
   57  * cause it to bail out early.
   58  *
   59  * For zap_cursor_init_serialized(), there are callers that do the iteration
   60  * outside of ZFS.  Typically they would iterate over everything, but we
   61  * don't have control of that.  E.g. zfs_ioc_snapshot_list_next(),
   62  * zcp_snapshots_iter(), and other iterators over things in the MOS - these
   63  * are called by /sbin/zfs and channel programs.  The other example is
   64  * zfs_readdir() which iterates over directory entries for the getdents()
   65  * syscall.  /sbin/ls iterates to the end (unless it receives a signal), but
   66  * userland doesn't have to.
   67  *
   68  * Given that the ZAP entries aren't returned in a specific order, the only
   69  * legitimate use cases for partial iteration would be:
   70  *
   71  * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
   72  *    get the first 100 and then wait for the user to hit "next page", which
   73  *    they may never do).
   74  *
   75  * 2. You want to know if there are more than X entries, without relying on
   76  *    the zfs-specific implementation of the directory's st_size (which is
   77  *    the number of entries).
   78  */
   79 static int zap_iterate_prefetch = B_TRUE;
   80 
      /* Default fat-ZAP block size, expressed as a power-of-two shift. */
   81 int fzap_default_block_shift = 14; /* 16k blocksize */
   82 
      /*
       * Forward declaration: zap_table_grow() calls this before its
       * definition further down in the file.
       */
   83 static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
   84 
   85 void
   86 fzap_byteswap(void *vbuf, size_t size)
   87 {
   88         uint64_t block_type = *(uint64_t *)vbuf;
   89 
   90         if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
   91                 zap_leaf_byteswap(vbuf, size);
   92         else {
   93                 /* it's a ptrtbl block */
   94                 byteswap_uint64_array(vbuf, size);
   95         }
   96 }
   97 
   98 void
   99 fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
  100 {
              /*
               * Turn zap into a fat ZAP: clear zap_ismicro, set up the
               * in-core fat-ZAP state, write a fresh header into the
               * zap's own dbuf, and initialize block 1 as the first
               * (and only) leaf.  The caller must hold zap_rwlock as
               * writer (asserted below).
               */
  101         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  102         zap->zap_ismicro = FALSE;
  103 
  104         zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
  105         zap->zap_dbu.dbu_evict_func_async = NULL;
  106 
  107         mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0);
              /* block shift is log2 of the header dbuf's size */
  108         zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
  109 
  110         zap_phys_t *zp = zap_f_phys(zap);
  111         /*
  112          * explicitly zero it since it might be coming from an
  113          * initialized microzap
  114          */
  115         memset(zap->zap_dbuf->db_data, 0, zap->zap_dbuf->db_size);
  116         zp->zap_block_type = ZBT_HEADER;
  117         zp->zap_magic = ZAP_MAGIC;
  118 
              /* start with the pointer table embedded in the header block */
  119         zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
  120 
  121         zp->zap_freeblk = 2;            /* block 1 will be the first leaf */
  122         zp->zap_num_leafs = 1;
  123         zp->zap_num_entries = 0;
  124         zp->zap_salt = zap->zap_salt;
  125         zp->zap_normflags = zap->zap_normflags;
  126         zp->zap_flags = flags;
  127 
  128         /* block 1 will be the first leaf */
  129         for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
  130                 ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
  131 
  132         /*
  133          * set up block 1 - the first leaf
  134          */
  135         dmu_buf_t *db;
              /* 1<<FZAP_BLOCK_SHIFT(zap) is the byte offset of block 1 */
  136         VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
  137             1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
  138         dmu_buf_will_dirty(db, tx);
  139 
              /*
               * The zap_leaf_t is only a temporary wrapper so that
               * zap_leaf_init() can be called on the buffer; it is
               * freed again right after.
               */
  140         zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
  141         l->l_dbuf = db;
  142 
  143         zap_leaf_init(l, zp->zap_normflags != 0);
  144 
  145         kmem_free(l, sizeof (zap_leaf_t));
  146         dmu_buf_rele(db, FTAG);
  147 }
  148 
  149 static int
  150 zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
  151 {
  152         if (RW_WRITE_HELD(&zap->zap_rwlock))
  153                 return (1);
  154         if (rw_tryupgrade(&zap->zap_rwlock)) {
  155                 dmu_buf_will_dirty(zap->zap_dbuf, tx);
  156                 return (1);
  157         }
  158         return (0);
  159 }
  160 
  161 /*
  162  * Generic routines for dealing with the pointer & cookie tables.
  163  */
  164 
  165 static int
  166 zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
  167     void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
  168     dmu_tx_t *tx)
  169 {
              /*
               * Perform one step of doubling the table described by tbl.
               * Each call copies a single old block (index zt_blks_copied)
               * into two blocks of the new, twice-as-large table using
               * transfer_func.  When the last old block has been copied,
               * the old range is freed and tbl is switched over to the new
               * location.  Returns 0, or the error from holding the old
               * block.  The caller must hold zap_rwlock as writer.
               */
  170         uint64_t newblk;
  171         int bs = FZAP_BLOCK_SHIFT(zap);
  172         int hepb = 1<<(bs-4);
  173         /* hepb = half the number of entries in a block */
  174 
  175         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  176         ASSERT(tbl->zt_blk != 0);
  177         ASSERT(tbl->zt_numblks > 0);
  178 
              /*
               * A nonzero zt_nextblk means a grow is already in progress;
               * otherwise allocate the new table and start prefetching the
               * old one.
               */
  179         if (tbl->zt_nextblk != 0) {
  180                 newblk = tbl->zt_nextblk;
  181         } else {
  182                 newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
  183                 tbl->zt_nextblk = newblk;
  184                 ASSERT0(tbl->zt_blks_copied);
  185                 dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
  186                     tbl->zt_blk << bs, tbl->zt_numblks << bs,
  187                     ZIO_PRIORITY_SYNC_READ);
  188         }
  189 
  190         /*
  191          * Copy the ptrtbl from the old to new location.
  192          */
  193 
  194         uint64_t b = tbl->zt_blks_copied;
  195         dmu_buf_t *db_old;
  196         int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
  197             (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
  198         if (err != 0)
  199                 return (err);
  200 
  201         /* first half of entries in old[b] go to new[2*b+0] */
  202         dmu_buf_t *db_new;
  203         VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
  204             (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
  205         dmu_buf_will_dirty(db_new, tx);
  206         transfer_func(db_old->db_data, db_new->db_data, hepb);
  207         dmu_buf_rele(db_new, FTAG);
  208 
  209         /* second half of entries in old[b] go to new[2*b+1] */
  210         VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
  211             (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
  212         dmu_buf_will_dirty(db_new, tx);
  213         transfer_func((uint64_t *)db_old->db_data + hepb,
  214             db_new->db_data, hepb);
  215         dmu_buf_rele(db_new, FTAG);
  216 
  217         dmu_buf_rele(db_old, FTAG);
  218 
  219         tbl->zt_blks_copied++;
  220 
  221         dprintf("copied block %llu of %llu\n",
  222             (u_longlong_t)tbl->zt_blks_copied,
  223             (u_longlong_t)tbl->zt_numblks);
  224 
              /* every old block has been copied: retire the old table */
  225         if (tbl->zt_blks_copied == tbl->zt_numblks) {
                      /* the result of freeing the old range is ignored */
  226                 (void) dmu_free_range(zap->zap_objset, zap->zap_object,
  227                     tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
  228 
  229                 tbl->zt_blk = newblk;
  230                 tbl->zt_numblks *= 2;
  231                 tbl->zt_shift++;
  232                 tbl->zt_nextblk = 0;
  233                 tbl->zt_blks_copied = 0;
  234 
  235                 dprintf("finished; numblocks now %llu (%uk entries)\n",
  236                     (u_longlong_t)tbl->zt_numblks, 1<<(tbl->zt_shift-10));
  237         }
  238 
  239         return (0);
  240 }
  241 
  242 static int
  243 zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
  244     dmu_tx_t *tx)
  245 {
  246         int bs = FZAP_BLOCK_SHIFT(zap);
  247 
  248         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
  249         ASSERT(tbl->zt_blk != 0);
  250 
  251         dprintf("storing %llx at index %llx\n", (u_longlong_t)val,
  252             (u_longlong_t)idx);
  253 
  254         uint64_t blk = idx >> (bs-3);
  255         uint64_t off = idx & ((1<<(bs-3))-1);
  256 
  257         dmu_buf_t *db;
  258         int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
  259             (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
  260         if (err != 0)
  261                 return (err);
  262         dmu_buf_will_dirty(db, tx);
  263 
  264         if (tbl->zt_nextblk != 0) {
  265                 uint64_t idx2 = idx * 2;
  266                 uint64_t blk2 = idx2 >> (bs-3);
  267                 uint64_t off2 = idx2 & ((1<<(bs-3))-1);
  268                 dmu_buf_t *db2;
  269 
  270                 err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
  271                     (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
  272                     DMU_READ_NO_PREFETCH);
  273                 if (err != 0) {
  274                         dmu_buf_rele(db, FTAG);
  275                         return (err);
  276                 }
  277                 dmu_buf_will_dirty(db2, tx);
  278                 ((uint64_t *)db2->db_data)[off2] = val;
  279                 ((uint64_t *)db2->db_data)[off2+1] = val;
  280                 dmu_buf_rele(db2, FTAG);
  281         }
  282 
  283         ((uint64_t *)db->db_data)[off] = val;
  284         dmu_buf_rele(db, FTAG);
  285 
  286         return (0);
  287 }
  288 
  289 static int
  290 zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
  291 {
  292         int bs = FZAP_BLOCK_SHIFT(zap);
  293 
  294         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
  295 
  296         uint64_t blk = idx >> (bs-3);
  297         uint64_t off = idx & ((1<<(bs-3))-1);
  298 
  299         /*
  300          * Note: this is equivalent to dmu_buf_hold(), but we use
  301          * _dnode_enter / _by_dnode because it's faster because we don't
  302          * have to hold the dnode.
  303          */
  304         dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
  305         dmu_buf_t *db;
  306         int err = dmu_buf_hold_by_dnode(dn,
  307             (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
  308         dmu_buf_dnode_exit(zap->zap_dbuf);
  309         if (err != 0)
  310                 return (err);
  311         *valp = ((uint64_t *)db->db_data)[off];
  312         dmu_buf_rele(db, FTAG);
  313 
  314         if (tbl->zt_nextblk != 0) {
  315                 /*
  316                  * read the nextblk for the sake of i/o error checking,
  317                  * so that zap_table_load() will catch errors for
  318                  * zap_table_store.
  319                  */
  320                 blk = (idx*2) >> (bs-3);
  321 
  322                 dn = dmu_buf_dnode_enter(zap->zap_dbuf);
  323                 err = dmu_buf_hold_by_dnode(dn,
  324                     (tbl->zt_nextblk + blk) << bs, FTAG, &db,
  325                     DMU_READ_NO_PREFETCH);
  326                 dmu_buf_dnode_exit(zap->zap_dbuf);
  327                 if (err == 0)
  328                         dmu_buf_rele(db, FTAG);
  329         }
  330         return (err);
  331 }
  332 
  333 /*
  334  * Routines for growing the ptrtbl.
  335  */
  336 
  /*
   * Expand n pointer-table entries from src into 2*n entries at dst:
   * each source entry is written to two consecutive destination slots.
   */
  static void
  zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
  {
          uint64_t *out = dst;

          for (int k = 0; k < n; k++, out += 2) {
                  /* read once so both slots get the same value */
                  const uint64_t leafblk = src[k];

                  out[0] = leafblk;
                  out[1] = leafblk;
          }
  }
  346 
  347 static int
  348 zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
  349 {
  350         /*
  351          * The pointer table should never use more hash bits than we
  352          * have (otherwise we'd be using useless zero bits to index it).
  353          * If we are within 2 bits of running out, stop growing, since
  354          * this is already an aberrant condition.
  355          */
  356         if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
  357                 return (SET_ERROR(ENOSPC));
  358 
  359         if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
  360                 /*
  361                  * We are outgrowing the "embedded" ptrtbl (the one
  362                  * stored in the header block).  Give it its own entire
  363                  * block, which will double the size of the ptrtbl.
  364                  */
  365                 ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
  366                     ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
  367                 ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
  368 
  369                 uint64_t newblk = zap_allocate_blocks(zap, 1);
  370                 dmu_buf_t *db_new;
  371                 int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
  372                     newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
  373                     DMU_READ_NO_PREFETCH);
  374                 if (err != 0)
  375                         return (err);
  376                 dmu_buf_will_dirty(db_new, tx);
  377                 zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
  378                     db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
  379                 dmu_buf_rele(db_new, FTAG);
  380 
  381                 zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
  382                 zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
  383                 zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
  384 
  385                 ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
  386                     zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
  387                     (FZAP_BLOCK_SHIFT(zap)-3));
  388 
  389                 return (0);
  390         } else {
  391                 return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
  392                     zap_ptrtbl_transfer, tx));
  393         }
  394 }
  395 
  396 static void
  397 zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
  398 {
  399         dmu_buf_will_dirty(zap->zap_dbuf, tx);
  400         mutex_enter(&zap->zap_f.zap_num_entries_mtx);
  401         ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
  402         zap_f_phys(zap)->zap_num_entries += delta;
  403         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
  404 }
  405 
  406 static uint64_t
  407 zap_allocate_blocks(zap_t *zap, int nblocks)
  408 {
  409         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  410         uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
  411         zap_f_phys(zap)->zap_freeblk += nblocks;
  412         return (newblk);
  413 }
  414 
  415 static void
  416 zap_leaf_evict_sync(void *dbu)
  417 {
  418         zap_leaf_t *l = dbu;
  419 
  420         rw_destroy(&l->l_rwlock);
  421         kmem_free(l, sizeof (zap_leaf_t));
  422 }
  423 
  424 static zap_leaf_t *
  425 zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
  426 {
              /*
               * Allocate a new block for a leaf, initialize it, and
               * return its in-core zap_leaf_t.  The leaf is returned
               * with l_rwlock held as writer and its dbuf held (NULL
               * tag, matching the release in zap_put_leaf()).  Also
               * bumps zap_num_leafs in the header.  The caller must
               * hold zap_rwlock as writer.
               */
  427         zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
  428 
  429         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  430 
  431         rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
  432         rw_enter(&l->l_rwlock, RW_WRITER);
  433         l->l_blkid = zap_allocate_blocks(zap, 1);
  434         l->l_dbuf = NULL;
  435 
  436         VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
  437             l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
  438             DMU_READ_NO_PREFETCH));
              /*
               * Attach the leaf as the dbuf's user; the block was just
               * allocated, so no user is expected to be set already.
               */
  439         dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
  440         VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
  441         dmu_buf_will_dirty(l->l_dbuf, tx);
  442 
  443         zap_leaf_init(l, zap->zap_normflags != 0);
  444 
  445         zap_f_phys(zap)->zap_num_leafs++;
  446 
  447         return (l);
  448 }
  449 
  450 int
  451 fzap_count(zap_t *zap, uint64_t *count)
  452 {
  453         ASSERT(!zap->zap_ismicro);
  454         mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
  455         *count = zap_f_phys(zap)->zap_num_entries;
  456         mutex_exit(&zap->zap_f.zap_num_entries_mtx);
  457         return (0);
  458 }
  459 
  460 /*
  461  * Routines for obtaining zap_leaf_t's
  462  */
  463 
  464 void
  465 zap_put_leaf(zap_leaf_t *l)
  466 {
              /*
               * Release a leaf: drop its rwlock first, then the hold
               * on its dbuf (NULL tag, as used when the leaf's dbuf is
               * held in zap_create_leaf() and zap_get_leaf_byblk()).
               */
  467         rw_exit(&l->l_rwlock);
  468         dmu_buf_rele(l->l_dbuf, NULL);
  469 }
  470 
  471 static zap_leaf_t *
  472 zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
  473 {
              /*
               * Build the in-core zap_leaf_t for the leaf stored in db
               * and register it as the dbuf's user.  If a user was
               * already registered, ours is discarded and the existing
               * one is returned instead.  The returned leaf is not
               * locked.
               */
  474         ASSERT(blkid != 0);
  475 
  476         zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
  477         rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL);
  478         rw_enter(&l->l_rwlock, RW_WRITER);
  479         l->l_blkid = blkid;
  480         l->l_bs = highbit64(db->db_size) - 1;
  481         l->l_dbuf = db;
  482 
  483         dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
              /* non-NULL result: a leaf is already attached to this dbuf */
  484         zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
  485 
  486         rw_exit(&l->l_rwlock);
  487         if (winner != NULL) {
  488                 /* someone else set it first */
                      /*
                       * NOTE(review): passes &l->l_dbu where the callback
                       * casts to zap_leaf_t *; presumably l_dbu is the
                       * first member of zap_leaf_t -- confirm.
                       */
  489                 zap_leaf_evict_sync(&l->l_dbu);
  490                 l = winner;
  491         }
  492 
  493         /*
  494          * lhr_pad was previously used for the next leaf in the leaf
  495          * chain.  There should be no chained leafs (as we have removed
  496          * support for them).
  497          */
  498         ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
  499 
  500         /*
  501          * There should be more hash entries than there can be
  502          * chunks to put in the hash table
  503          */
  504         ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
  505 
  506         /* The chunks should begin at the end of the hash table */
  507         ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *)
  508             &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
  509 
  510         /* The chunks should end at the end of the block */
  511         ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
  512             (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
  513 
  514         return (l);
  515 }
  516 
  517 static int
  518 zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
  519     zap_leaf_t **lp)
  520 {
              /*
               * Look up the leaf stored in block blkid, lock it with
               * lock type lt, and return it through *lp.  The leaf's
               * dbuf is held with a NULL tag (released again by
               * zap_put_leaf()); for RW_WRITER the dbuf is also dirtied
               * in tx.  Returns ENOENT for blkid 0, or the error from
               * holding the block; *lp is only set on success.
               */
  521         dmu_buf_t *db;
  522 
  523         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
  524 
  525         /*
  526          * If system crashed just after dmu_free_long_range in zfs_rmnode, we
  527          * would be left with an empty xattr dir in delete queue. blkid=0
  528          * would be passed in when doing zfs_purgedir. If that's the case we
  529          * should just return immediately. The underlying objects should
  530          * already be freed, so this should be perfectly fine.
  531          */
  532         if (blkid == 0)
  533                 return (SET_ERROR(ENOENT));
  534 
  535         int bs = FZAP_BLOCK_SHIFT(zap);
  536         dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
  537         int err = dmu_buf_hold_by_dnode(dn,
  538             blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
  539         dmu_buf_dnode_exit(zap->zap_dbuf);
  540         if (err != 0)
  541                 return (err);
  542 
  543         ASSERT3U(db->db_object, ==, zap->zap_object);
  544         ASSERT3U(db->db_offset, ==, blkid << bs);
  545         ASSERT3U(db->db_size, ==, 1 << bs);
  546         ASSERT(blkid != 0);
  547 
              /* reuse the leaf already attached to the dbuf, if any */
  548         zap_leaf_t *l = dmu_buf_get_user(db);
  549 
  550         if (l == NULL)
  551                 l = zap_open_leaf(blkid, db);
  552 
  553         rw_enter(&l->l_rwlock, lt);
  554         /*
  555          * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
  556          * causing ASSERT below to fail.
  557          */
  558         if (lt == RW_WRITER)
  559                 dmu_buf_will_dirty(db, tx);
  560         ASSERT3U(l->l_blkid, ==, blkid);
  561         ASSERT3P(l->l_dbuf, ==, db);
  562         ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
  563         ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
  564 
  565         *lp = l;
  566         return (0);
  567 }
  568 
  569 static int
  570 zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
  571 {
  572         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
  573 
  574         if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
  575                 ASSERT3U(idx, <,
  576                     (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
  577                 *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
  578                 return (0);
  579         } else {
  580                 return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
  581                     idx, valp));
  582         }
  583 }
  584 
  585 static int
  586 zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
  587 {
  588         ASSERT(tx != NULL);
  589         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  590 
  591         if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
  592                 ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
  593                 return (0);
  594         } else {
  595                 return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
  596                     idx, blk, tx));
  597         }
  598 }
  599 
  600 static int
  601 zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
  602 {
  603         uint64_t blk;
  604 
  605         ASSERT(zap->zap_dbuf == NULL ||
  606             zap_f_phys(zap) == zap->zap_dbuf->db_data);
  607 
  608         /* Reality check for corrupt zap objects (leaf or header). */
  609         if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
  610             zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
  611             zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
  612                 return (SET_ERROR(EIO));
  613         }
  614 
  615         uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
  616         int err = zap_idx_to_blk(zap, idx, &blk);
  617         if (err != 0)
  618                 return (err);
  619         err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
  620 
  621         ASSERT(err ||
  622             ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
  623             zap_leaf_phys(*lp)->l_hdr.lh_prefix);
  624         return (err);
  625 }
  626 
  627 static int
  628 zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
  629     const void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
  630 {
  631         zap_t *zap = zn->zn_zap;
  632         uint64_t hash = zn->zn_hash;
  633         int err;
  634         int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
  635 
  636         ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
  637         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
  638 
  639         ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
  640             zap_leaf_phys(l)->l_hdr.lh_prefix);
  641 
  642         if (zap_tryupgradedir(zap, tx) == 0 ||
  643             old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
  644                 /* We failed to upgrade, or need to grow the pointer table */
  645                 objset_t *os = zap->zap_objset;
  646                 uint64_t object = zap->zap_object;
  647 
  648                 zap_put_leaf(l);
  649                 zap_unlockdir(zap, tag);
  650                 err = zap_lockdir(os, object, tx, RW_WRITER,
  651                     FALSE, FALSE, tag, &zn->zn_zap);
  652                 zap = zn->zn_zap;
  653                 if (err != 0)
  654                         return (err);
  655                 ASSERT(!zap->zap_ismicro);
  656 
  657                 while (old_prefix_len ==
  658                     zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
  659                         err = zap_grow_ptrtbl(zap, tx);
  660                         if (err != 0)
  661                                 return (err);
  662                 }
  663 
  664                 err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
  665                 if (err != 0)
  666                         return (err);
  667 
  668                 if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
  669                         /* it split while our locks were down */
  670                         *lp = l;
  671                         return (0);
  672                 }
  673         }
  674         ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
  675         ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
  676         ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
  677             zap_leaf_phys(l)->l_hdr.lh_prefix);
  678 
  679         int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
  680             (old_prefix_len + 1);
  681         uint64_t sibling =
  682             (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
  683 
  684         /* check for i/o errors before doing zap_leaf_split */
  685         for (int i = 0; i < (1ULL << prefix_diff); i++) {
  686                 uint64_t blk;
  687                 err = zap_idx_to_blk(zap, sibling + i, &blk);
  688                 if (err != 0)
  689                         return (err);
  690                 ASSERT3U(blk, ==, l->l_blkid);
  691         }
  692 
  693         zap_leaf_t *nl = zap_create_leaf(zap, tx);
  694         zap_leaf_split(l, nl, zap->zap_normflags != 0);
  695 
  696         /* set sibling pointers */
  697         for (int i = 0; i < (1ULL << prefix_diff); i++) {
  698                 err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
  699                 ASSERT0(err); /* we checked for i/o errors above */
  700         }
  701 
  702         ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0);
  703 
  704         if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
  705                 /* we want the sibling */
  706                 zap_put_leaf(l);
  707                 *lp = nl;
  708         } else {
  709                 zap_put_leaf(nl);
  710                 *lp = l;
  711         }
  712 
  713         return (0);
  714 }
  715 
  716 static void
  717 zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
  718     const void *tag, dmu_tx_t *tx)
  719 {
              /*
               * Release leaf l and, if worthwhile, advance pointer-table
               * growth by one step.  This is best effort: the function
               * returns nothing, and failures to re-lock or to grow are
               * silently ignored.  zn->zn_zap is refreshed if the
               * directory lock had to be dropped and re-taken.
               */
  720         zap_t *zap = zn->zn_zap;
  721         int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
              /*
               * "full" here means: the leaf already uses every bit the
               * pointer table offers and has fewer than
               * ZAP_LEAF_LOW_WATER free chunks.  Sampled before the
               * leaf is released.
               */
  722         int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
  723             zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
  724 
  725         zap_put_leaf(l);
  726 
  727         if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
  728                 /*
  729                  * We are in the middle of growing the pointer table, or
  730                  * this leaf will soon make us grow it.
  731                  */
  732                 if (zap_tryupgradedir(zap, tx) == 0) {
                              /* could not upgrade in place: drop and re-lock */
  733                         objset_t *os = zap->zap_objset;
  734                         uint64_t zapobj = zap->zap_object;
  735 
  736                         zap_unlockdir(zap, tag);
  737                         int err = zap_lockdir(os, zapobj, tx,
  738                             RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
  739                         zap = zn->zn_zap;
  740                         if (err != 0)
  741                                 return;
  742                 }
  743 
  744                 /* could have finished growing while our locks were down */
  745                 if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
  746                         (void) zap_grow_ptrtbl(zap, tx);
  747         }
  748 }
  749 
  750 static int
  751 fzap_checkname(zap_name_t *zn)
  752 {
  753         if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
  754                 return (SET_ERROR(ENAMETOOLONG));
  755         return (0);
  756 }
  757 
  758 static int
  759 fzap_checksize(uint64_t integer_size, uint64_t num_integers)
  760 {
  761         /* Only integer sizes supported by C */
  762         switch (integer_size) {
  763         case 1:
  764         case 2:
  765         case 4:
  766         case 8:
  767                 break;
  768         default:
  769                 return (SET_ERROR(EINVAL));
  770         }
  771 
  772         if (integer_size * num_integers > ZAP_MAXVALUELEN)
  773                 return (SET_ERROR(E2BIG));
  774 
  775         return (0);
  776 }
  777 
  778 static int
  779 fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
  780 {
  781         int err = fzap_checkname(zn);
  782         if (err != 0)
  783                 return (err);
  784         return (fzap_checksize(integer_size, num_integers));
  785 }
  786 
  787 /*
  788  * Routines for manipulating attributes.
  789  */
  790 int
  791 fzap_lookup(zap_name_t *zn,
  792     uint64_t integer_size, uint64_t num_integers, void *buf,
  793     char *realname, int rn_len, boolean_t *ncp)
  794 {
  795         zap_leaf_t *l;
  796         zap_entry_handle_t zeh;
  797 
  798         int err = fzap_checkname(zn);
  799         if (err != 0)
  800                 return (err);
  801 
  802         err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
  803         if (err != 0)
  804                 return (err);
  805         err = zap_leaf_lookup(l, zn, &zeh);
  806         if (err == 0) {
  807                 if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
  808                         zap_put_leaf(l);
  809                         return (err);
  810                 }
  811 
  812                 err = zap_entry_read(&zeh, integer_size, num_integers, buf);
  813                 (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
  814                 if (ncp) {
  815                         *ncp = zap_entry_normalization_conflict(&zeh,
  816                             zn, NULL, zn->zn_zap);
  817                 }
  818         }
  819 
  820         zap_put_leaf(l);
  821         return (err);
  822 }
  823 
  824 int
  825 fzap_add_cd(zap_name_t *zn,
  826     uint64_t integer_size, uint64_t num_integers,
  827     const void *val, uint32_t cd, const void *tag, dmu_tx_t *tx)
  828 {
  829         zap_leaf_t *l;
  830         int err;
  831         zap_entry_handle_t zeh;
  832         zap_t *zap = zn->zn_zap;
  833 
  834         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
  835         ASSERT(!zap->zap_ismicro);
  836         ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
  837 
  838         err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
  839         if (err != 0)
  840                 return (err);
  841 retry:
  842         err = zap_leaf_lookup(l, zn, &zeh);
  843         if (err == 0) {
  844                 err = SET_ERROR(EEXIST);
  845                 goto out;
  846         }
  847         if (err != ENOENT)
  848                 goto out;
  849 
  850         err = zap_entry_create(l, zn, cd,
  851             integer_size, num_integers, val, &zeh);
  852 
  853         if (err == 0) {
  854                 zap_increment_num_entries(zap, 1, tx);
  855         } else if (err == EAGAIN) {
  856                 err = zap_expand_leaf(zn, l, tag, tx, &l);
  857                 zap = zn->zn_zap;       /* zap_expand_leaf() may change zap */
  858                 if (err == 0) {
  859                         goto retry;
  860                 } else if (err == ENOSPC) {
  861                         /*
  862                          * If we failed to expand the leaf, then bailout
  863                          * as there is no point trying
  864                          * zap_put_leaf_maybe_grow_ptrtbl().
  865                          */
  866                         return (err);
  867                 }
  868         }
  869 
  870 out:
  871         if (zap != NULL)
  872                 zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
  873         return (err);
  874 }
  875 
  876 int
  877 fzap_add(zap_name_t *zn,
  878     uint64_t integer_size, uint64_t num_integers,
  879     const void *val, const void *tag, dmu_tx_t *tx)
  880 {
  881         int err = fzap_check(zn, integer_size, num_integers);
  882         if (err != 0)
  883                 return (err);
  884 
  885         return (fzap_add_cd(zn, integer_size, num_integers,
  886             val, ZAP_NEED_CD, tag, tx));
  887 }
  888 
  889 int
  890 fzap_update(zap_name_t *zn,
  891     int integer_size, uint64_t num_integers, const void *val,
  892     const void *tag, dmu_tx_t *tx)
  893 {
  894         zap_leaf_t *l;
  895         int err;
  896         boolean_t create;
  897         zap_entry_handle_t zeh;
  898         zap_t *zap = zn->zn_zap;
  899 
  900         ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
  901         err = fzap_check(zn, integer_size, num_integers);
  902         if (err != 0)
  903                 return (err);
  904 
  905         err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
  906         if (err != 0)
  907                 return (err);
  908 retry:
  909         err = zap_leaf_lookup(l, zn, &zeh);
  910         create = (err == ENOENT);
  911         ASSERT(err == 0 || err == ENOENT);
  912 
  913         if (create) {
  914                 err = zap_entry_create(l, zn, ZAP_NEED_CD,
  915                     integer_size, num_integers, val, &zeh);
  916                 if (err == 0)
  917                         zap_increment_num_entries(zap, 1, tx);
  918         } else {
  919                 err = zap_entry_update(&zeh, integer_size, num_integers, val);
  920         }
  921 
  922         if (err == EAGAIN) {
  923                 err = zap_expand_leaf(zn, l, tag, tx, &l);
  924                 zap = zn->zn_zap;       /* zap_expand_leaf() may change zap */
  925                 if (err == 0)
  926                         goto retry;
  927         }
  928 
  929         if (zap != NULL)
  930                 zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
  931         return (err);
  932 }
  933 
  934 int
  935 fzap_length(zap_name_t *zn,
  936     uint64_t *integer_size, uint64_t *num_integers)
  937 {
  938         zap_leaf_t *l;
  939         int err;
  940         zap_entry_handle_t zeh;
  941 
  942         err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
  943         if (err != 0)
  944                 return (err);
  945         err = zap_leaf_lookup(l, zn, &zeh);
  946         if (err != 0)
  947                 goto out;
  948 
  949         if (integer_size != NULL)
  950                 *integer_size = zeh.zeh_integer_size;
  951         if (num_integers != NULL)
  952                 *num_integers = zeh.zeh_num_integers;
  953 out:
  954         zap_put_leaf(l);
  955         return (err);
  956 }
  957 
  958 int
  959 fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
  960 {
  961         zap_leaf_t *l;
  962         int err;
  963         zap_entry_handle_t zeh;
  964 
  965         err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
  966         if (err != 0)
  967                 return (err);
  968         err = zap_leaf_lookup(l, zn, &zeh);
  969         if (err == 0) {
  970                 zap_entry_remove(&zeh);
  971                 zap_increment_num_entries(zn->zn_zap, -1, tx);
  972         }
  973         zap_put_leaf(l);
  974         return (err);
  975 }
  976 
  977 void
  978 fzap_prefetch(zap_name_t *zn)
  979 {
  980         uint64_t blk;
  981         zap_t *zap = zn->zn_zap;
  982 
  983         uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
  984             zap_f_phys(zap)->zap_ptrtbl.zt_shift);
  985         if (zap_idx_to_blk(zap, idx, &blk) != 0)
  986                 return;
  987         int bs = FZAP_BLOCK_SHIFT(zap);
  988         dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
  989             ZIO_PRIORITY_SYNC_READ);
  990 }
  991 
  992 /*
  993  * Helper functions for consumers.
  994  */
  995 
  996 uint64_t
  997 zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
  998     const char *name, dmu_tx_t *tx)
  999 {
 1000         return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
 1001 }
 1002 
 1003 uint64_t
 1004 zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
 1005     const char *name, int dnodesize, dmu_tx_t *tx)
 1006 {
 1007         uint64_t new_obj;
 1008 
 1009         new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx);
 1010         VERIFY(new_obj != 0);
 1011         VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
 1012             tx));
 1013 
 1014         return (new_obj);
 1015 }
 1016 
 1017 int
 1018 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
 1019     char *name)
 1020 {
 1021         zap_cursor_t zc;
 1022         int err;
 1023 
 1024         if (mask == 0)
 1025                 mask = -1ULL;
 1026 
 1027         zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
 1028         for (zap_cursor_init(&zc, os, zapobj);
 1029             (err = zap_cursor_retrieve(&zc, za)) == 0;
 1030             zap_cursor_advance(&zc)) {
 1031                 if ((za->za_first_integer & mask) == (value & mask)) {
 1032                         (void) strlcpy(name, za->za_name, MAXNAMELEN);
 1033                         break;
 1034                 }
 1035         }
 1036         zap_cursor_fini(&zc);
 1037         kmem_free(za, sizeof (*za));
 1038         return (err);
 1039 }
 1040 
 1041 int
 1042 zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
 1043 {
 1044         zap_cursor_t zc;
 1045         int err = 0;
 1046 
 1047         zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
 1048         for (zap_cursor_init(&zc, os, fromobj);
 1049             zap_cursor_retrieve(&zc, za) == 0;
 1050             (void) zap_cursor_advance(&zc)) {
 1051                 if (za->za_integer_length != 8 || za->za_num_integers != 1) {
 1052                         err = SET_ERROR(EINVAL);
 1053                         break;
 1054                 }
 1055                 err = zap_add(os, intoobj, za->za_name,
 1056                     8, 1, &za->za_first_integer, tx);
 1057                 if (err != 0)
 1058                         break;
 1059         }
 1060         zap_cursor_fini(&zc);
 1061         kmem_free(za, sizeof (*za));
 1062         return (err);
 1063 }
 1064 
 1065 int
 1066 zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
 1067     uint64_t value, dmu_tx_t *tx)
 1068 {
 1069         zap_cursor_t zc;
 1070         int err = 0;
 1071 
 1072         zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
 1073         for (zap_cursor_init(&zc, os, fromobj);
 1074             zap_cursor_retrieve(&zc, za) == 0;
 1075             (void) zap_cursor_advance(&zc)) {
 1076                 if (za->za_integer_length != 8 || za->za_num_integers != 1) {
 1077                         err = SET_ERROR(EINVAL);
 1078                         break;
 1079                 }
 1080                 err = zap_add(os, intoobj, za->za_name,
 1081                     8, 1, &value, tx);
 1082                 if (err != 0)
 1083                         break;
 1084         }
 1085         zap_cursor_fini(&zc);
 1086         kmem_free(za, sizeof (*za));
 1087         return (err);
 1088 }
 1089 
 1090 int
 1091 zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
 1092     dmu_tx_t *tx)
 1093 {
 1094         zap_cursor_t zc;
 1095         int err = 0;
 1096 
 1097         zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
 1098         for (zap_cursor_init(&zc, os, fromobj);
 1099             zap_cursor_retrieve(&zc, za) == 0;
 1100             (void) zap_cursor_advance(&zc)) {
 1101                 uint64_t delta = 0;
 1102 
 1103                 if (za->za_integer_length != 8 || za->za_num_integers != 1) {
 1104                         err = SET_ERROR(EINVAL);
 1105                         break;
 1106                 }
 1107 
 1108                 err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta);
 1109                 if (err != 0 && err != ENOENT)
 1110                         break;
 1111                 delta += za->za_first_integer;
 1112                 err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx);
 1113                 if (err != 0)
 1114                         break;
 1115         }
 1116         zap_cursor_fini(&zc);
 1117         kmem_free(za, sizeof (*za));
 1118         return (err);
 1119 }
 1120 
 1121 int
 1122 zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 1123 {
 1124         char name[20];
 1125 
 1126         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
 1127         return (zap_add(os, obj, name, 8, 1, &value, tx));
 1128 }
 1129 
 1130 int
 1131 zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
 1132 {
 1133         char name[20];
 1134 
 1135         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
 1136         return (zap_remove(os, obj, name, tx));
 1137 }
 1138 
 1139 int
 1140 zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
 1141 {
 1142         char name[20];
 1143 
 1144         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
 1145         return (zap_lookup(os, obj, name, 8, 1, &value));
 1146 }
 1147 
 1148 int
 1149 zap_add_int_key(objset_t *os, uint64_t obj,
 1150     uint64_t key, uint64_t value, dmu_tx_t *tx)
 1151 {
 1152         char name[20];
 1153 
 1154         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 1155         return (zap_add(os, obj, name, 8, 1, &value, tx));
 1156 }
 1157 
 1158 int
 1159 zap_update_int_key(objset_t *os, uint64_t obj,
 1160     uint64_t key, uint64_t value, dmu_tx_t *tx)
 1161 {
 1162         char name[20];
 1163 
 1164         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 1165         return (zap_update(os, obj, name, 8, 1, &value, tx));
 1166 }
 1167 
 1168 int
 1169 zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
 1170 {
 1171         char name[20];
 1172 
 1173         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 1174         return (zap_lookup(os, obj, name, 8, 1, valuep));
 1175 }
 1176 
 1177 int
 1178 zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
 1179     dmu_tx_t *tx)
 1180 {
 1181         uint64_t value = 0;
 1182 
 1183         if (delta == 0)
 1184                 return (0);
 1185 
 1186         int err = zap_lookup(os, obj, name, 8, 1, &value);
 1187         if (err != 0 && err != ENOENT)
 1188                 return (err);
 1189         value += delta;
 1190         if (value == 0)
 1191                 err = zap_remove(os, obj, name, tx);
 1192         else
 1193                 err = zap_update(os, obj, name, 8, 1, &value, tx);
 1194         return (err);
 1195 }
 1196 
 1197 int
 1198 zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
 1199     dmu_tx_t *tx)
 1200 {
 1201         char name[20];
 1202 
 1203         (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
 1204         return (zap_increment(os, obj, name, delta, tx));
 1205 }
 1206 
 1207 /*
 1208  * Routines for iterating over the attributes.
 1209  */
 1210 
 1211 int
 1212 fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
 1213 {
 1214         int err = ENOENT;
 1215         zap_entry_handle_t zeh;
 1216         zap_leaf_t *l;
 1217 
 1218         /* retrieve the next entry at or after zc_hash/zc_cd */
 1219         /* if no entry, return ENOENT */
 1220 
 1221         /*
 1222          * If we are reading from the beginning, we're almost certain to
 1223          * iterate over the entire ZAP object.  If there are multiple leaf
 1224          * blocks (freeblk > 2), prefetch the whole object (up to
 1225          * dmu_prefetch_max bytes), so that we read the leaf blocks
 1226          * concurrently. (Unless noprefetch was requested via
 1227          * zap_cursor_init_noprefetch()).
 1228          */
 1229         if (zc->zc_hash == 0 && zap_iterate_prefetch &&
 1230             zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
 1231                 dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
 1232                     zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
 1233                     ZIO_PRIORITY_ASYNC_READ);
 1234         }
 1235 
 1236         if (zc->zc_leaf &&
 1237             (ZAP_HASH_IDX(zc->zc_hash,
 1238             zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
 1239             zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
 1240                 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 1241                 zap_put_leaf(zc->zc_leaf);
 1242                 zc->zc_leaf = NULL;
 1243         }
 1244 
 1245 again:
 1246         if (zc->zc_leaf == NULL) {
 1247                 err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
 1248                     &zc->zc_leaf);
 1249                 if (err != 0)
 1250                         return (err);
 1251         } else {
 1252                 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
 1253         }
 1254         l = zc->zc_leaf;
 1255 
 1256         err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
 1257 
 1258         if (err == ENOENT) {
 1259                 if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) {
 1260                         zc->zc_hash = -1ULL;
 1261                         zc->zc_cd = 0;
 1262                 } else {
 1263                         uint64_t nocare = (1ULL <<
 1264                             (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
 1265 
 1266                         zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
 1267                         zc->zc_cd = 0;
 1268 
 1269                         if (zc->zc_hash == 0) {
 1270                                 zc->zc_hash = -1ULL;
 1271                         } else {
 1272                                 zap_put_leaf(zc->zc_leaf);
 1273                                 zc->zc_leaf = NULL;
 1274                                 goto again;
 1275                         }
 1276                 }
 1277         }
 1278 
 1279         if (err == 0) {
 1280                 zc->zc_hash = zeh.zeh_hash;
 1281                 zc->zc_cd = zeh.zeh_cd;
 1282                 za->za_integer_length = zeh.zeh_integer_size;
 1283                 za->za_num_integers = zeh.zeh_num_integers;
 1284                 if (zeh.zeh_num_integers == 0) {
 1285                         za->za_first_integer = 0;
 1286                 } else {
 1287                         err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
 1288                         ASSERT(err == 0 || err == EOVERFLOW);
 1289                 }
 1290                 err = zap_entry_read_name(zap, &zeh,
 1291                     sizeof (za->za_name), za->za_name);
 1292                 ASSERT(err == 0);
 1293 
 1294                 za->za_normalization_conflict =
 1295                     zap_entry_normalization_conflict(&zeh,
 1296                     NULL, za->za_name, zap);
 1297         }
 1298         rw_exit(&zc->zc_leaf->l_rwlock);
 1299         return (err);
 1300 }
 1301 
 1302 static void
 1303 zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
 1304 {
 1305         uint64_t lastblk = 0;
 1306 
 1307         /*
 1308          * NB: if a leaf has more pointers than an entire ptrtbl block
 1309          * can hold, then it'll be accounted for more than once, since
 1310          * we won't have lastblk.
 1311          */
 1312         for (int i = 0; i < len; i++) {
 1313                 zap_leaf_t *l;
 1314 
 1315                 if (tbl[i] == lastblk)
 1316                         continue;
 1317                 lastblk = tbl[i];
 1318 
 1319                 int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
 1320                 if (err == 0) {
 1321                         zap_leaf_stats(zap, l, zs);
 1322                         zap_put_leaf(l);
 1323                 }
 1324         }
 1325 }
 1326 
 1327 void
 1328 fzap_get_stats(zap_t *zap, zap_stats_t *zs)
 1329 {
 1330         int bs = FZAP_BLOCK_SHIFT(zap);
 1331         zs->zs_blocksize = 1ULL << bs;
 1332 
 1333         /*
 1334          * Set zap_phys_t fields
 1335          */
 1336         zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
 1337         zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
 1338         zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
 1339         zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
 1340         zs->zs_magic = zap_f_phys(zap)->zap_magic;
 1341         zs->zs_salt = zap_f_phys(zap)->zap_salt;
 1342 
 1343         /*
 1344          * Set zap_ptrtbl fields
 1345          */
 1346         zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 1347         zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
 1348         zs->zs_ptrtbl_blks_copied =
 1349             zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
 1350         zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
 1351         zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 1352         zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
 1353 
 1354         if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
 1355                 /* the ptrtbl is entirely in the header block. */
 1356                 zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
 1357                     1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
 1358         } else {
 1359                 dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
 1360                     zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
 1361                     zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
 1362                     ZIO_PRIORITY_SYNC_READ);
 1363 
 1364                 for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
 1365                     b++) {
 1366                         dmu_buf_t *db;
 1367                         int err;
 1368 
 1369                         err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
 1370                             (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
 1371                             FTAG, &db, DMU_READ_NO_PREFETCH);
 1372                         if (err == 0) {
 1373                                 zap_stats_ptrtbl(zap, db->db_data,
 1374                                     1<<(bs-3), zs);
 1375                                 dmu_buf_rele(db, FTAG);
 1376                         }
 1377                 }
 1378         }
 1379 }
 1380 
 1381 /* CSTYLED */
 1382 ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
 1383         "When iterating ZAP object, prefetch it");

Cache object: eaa87cf4eb1e87193d81869b75ea4dcd


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.