The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2007 Doug Rabson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/9.1/sys/boot/zfs/zfsimpl.c 237771 2012-06-29 10:30:59Z avg $");
   29 
   30 /*
   31  *      Stand-alone ZFS file reader.
   32  */
   33 
   34 #include <sys/stat.h>
   35 #include <sys/stdint.h>
   36 
   37 #include "zfsimpl.h"
   38 #include "zfssubr.c"
   39 
   40 
   41 struct zfsmount {                     /* groups a pool reference with an object set and an object number */
   42         const spa_t     *spa;         /* pool this mount refers to; read-only through this pointer */
   43         objset_phys_t   objset;       /* objset_phys_t held by value, not by pointer */
   44         uint64_t        rootobj;      /* NOTE(review): looks like the root directory's object number -- confirm against the users of this struct */
   45 };
   46 
   47 /*
   48  * List of all vdevs, chained through v_alllink.
   49  */
   50 static vdev_list_t zfs_vdevs;
   51 
   52 /*
   53  * List of all pools, chained through spa_link.
   54  */
   55 static spa_list_t zfs_pools;
   56 
   57 static uint64_t zfs_crc64_table[256];             /* presumably filled in by zfs_init_crc() -- confirm in zfssubr.c */
   58 static const dnode_phys_t *dnode_cache_obj = 0;   /* NOTE(review): looks like the dnode whose block sits in dnode_cache_buf -- confirm */
   59 static uint64_t dnode_cache_bn;                   /* NOTE(review): looks like the block number held in dnode_cache_buf -- confirm */
   60 static char *dnode_cache_buf;                     /* SPA_MAXBLOCKSIZE bytes, allocated in zfs_init() */
   61 static char *zap_scratch;                         /* SPA_MAXBLOCKSIZE bytes, allocated in zfs_init(); vdev_probe() also reads the vdev label into it */
   62 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;  /* base, end and cursor of the arena used by zfs_alloc()/zfs_free() */
   63 
   64 #define TEMP_SIZE       (1024 * 1024)             /* size in bytes of the zfs_temp_buf arena */
   65 
   66 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);    /* forward declaration; defined later in this file */
   67 
   68 static void
   69 zfs_init(void)
   70 {
   71         STAILQ_INIT(&zfs_vdevs);
   72         STAILQ_INIT(&zfs_pools);
   73 
   74         zfs_temp_buf = malloc(TEMP_SIZE);
   75         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
   76         zfs_temp_ptr = zfs_temp_buf;
   77         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
   78         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
   79 
   80         zfs_init_crc();
   81 }
   82 
   83 static void *
   84 zfs_alloc(size_t size)
   85 {
   86         char *ptr;
   87 
   88         if (zfs_temp_ptr + size > zfs_temp_end) {
   89                 printf("ZFS: out of temporary buffer space\n");
   90                 for (;;) ;
   91         }
   92         ptr = zfs_temp_ptr;
   93         zfs_temp_ptr += size;
   94 
   95         return (ptr);
   96 }
   97 
   98 static void
   99 zfs_free(void *ptr, size_t size)
  100 {
  101 
  102         zfs_temp_ptr -= size;
  103         if (zfs_temp_ptr != ptr) {
  104                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
  105                 for (;;) ;
  106         }
  107 }
  108 
  109 static int
  110 xdr_int(const unsigned char **xdr, int *ip)
  111 {
  112         *ip = ((*xdr)[0] << 24)
  113                 | ((*xdr)[1] << 16)
  114                 | ((*xdr)[2] << 8)
  115                 | ((*xdr)[3] << 0);
  116         (*xdr) += 4;
  117         return (0);
  118 }
  119 
  120 static int
  121 xdr_u_int(const unsigned char **xdr, u_int *ip)
  122 {
  123         *ip = ((*xdr)[0] << 24)
  124                 | ((*xdr)[1] << 16)
  125                 | ((*xdr)[2] << 8)
  126                 | ((*xdr)[3] << 0);
  127         (*xdr) += 4;
  128         return (0);
  129 }
  130 
  131 static int
  132 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
  133 {
  134         u_int hi, lo;
  135 
  136         xdr_u_int(xdr, &hi);
  137         xdr_u_int(xdr, &lo);
  138         *lp = (((uint64_t) hi) << 32) | lo;
  139         return (0);
  140 }
  141 
  142 static int
  143 nvlist_find(const unsigned char *nvlist, const char *name, int type,
  144             int* elementsp, void *valuep)
  145 {
  146         const unsigned char *p, *pair;
  147         int junk;
  148         int encoded_size, decoded_size;
  149 
  150         p = nvlist;
  151         xdr_int(&p, &junk);
  152         xdr_int(&p, &junk);
  153 
  154         pair = p;
  155         xdr_int(&p, &encoded_size);
  156         xdr_int(&p, &decoded_size);
  157         while (encoded_size && decoded_size) {
  158                 int namelen, pairtype, elements;
  159                 const char *pairname;
  160 
  161                 xdr_int(&p, &namelen);
  162                 pairname = (const char*) p;
  163                 p += roundup(namelen, 4);
  164                 xdr_int(&p, &pairtype);
  165 
  166                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
  167                         xdr_int(&p, &elements);
  168                         if (elementsp)
  169                                 *elementsp = elements;
  170                         if (type == DATA_TYPE_UINT64) {
  171                                 xdr_uint64_t(&p, (uint64_t *) valuep);
  172                                 return (0);
  173                         } else if (type == DATA_TYPE_STRING) {
  174                                 int len;
  175                                 xdr_int(&p, &len);
  176                                 (*(const char**) valuep) = (const char*) p;
  177                                 return (0);
  178                         } else if (type == DATA_TYPE_NVLIST
  179                                    || type == DATA_TYPE_NVLIST_ARRAY) {
  180                                 (*(const unsigned char**) valuep) =
  181                                          (const unsigned char*) p;
  182                                 return (0);
  183                         } else {
  184                                 return (EIO);
  185                         }
  186                 } else {
  187                         /*
  188                          * Not the pair we are looking for, skip to the next one.
  189                          */
  190                         p = pair + encoded_size;
  191                 }
  192 
  193                 pair = p;
  194                 xdr_int(&p, &encoded_size);
  195                 xdr_int(&p, &decoded_size);
  196         }
  197 
  198         return (EIO);
  199 }
  200 
  201 /*
  202  * Return the next nvlist in an nvlist array.
  203  */
  204 static const unsigned char *
  205 nvlist_next(const unsigned char *nvlist)
  206 {
  207         const unsigned char *p, *pair;
  208         int junk;
  209         int encoded_size, decoded_size;
  210 
  211         p = nvlist;
  212         xdr_int(&p, &junk);
  213         xdr_int(&p, &junk);
  214 
  215         pair = p;
  216         xdr_int(&p, &encoded_size);
  217         xdr_int(&p, &decoded_size);
  218         while (encoded_size && decoded_size) {
  219                 p = pair + encoded_size;
  220 
  221                 pair = p;
  222                 xdr_int(&p, &encoded_size);
  223                 xdr_int(&p, &decoded_size);
  224         }
  225 
  226         return p;
  227 }
  228 
  229 #ifdef TEST
  230 
  231 static const unsigned char *
  232 nvlist_print(const unsigned char *nvlist, unsigned int indent)
  233 {
  234         static const char* typenames[] = {
  235                 "DATA_TYPE_UNKNOWN",
  236                 "DATA_TYPE_BOOLEAN",
  237                 "DATA_TYPE_BYTE",
  238                 "DATA_TYPE_INT16",
  239                 "DATA_TYPE_UINT16",
  240                 "DATA_TYPE_INT32",
  241                 "DATA_TYPE_UINT32",
  242                 "DATA_TYPE_INT64",
  243                 "DATA_TYPE_UINT64",
  244                 "DATA_TYPE_STRING",
  245                 "DATA_TYPE_BYTE_ARRAY",
  246                 "DATA_TYPE_INT16_ARRAY",
  247                 "DATA_TYPE_UINT16_ARRAY",
  248                 "DATA_TYPE_INT32_ARRAY",
  249                 "DATA_TYPE_UINT32_ARRAY",
  250                 "DATA_TYPE_INT64_ARRAY",
  251                 "DATA_TYPE_UINT64_ARRAY",
  252                 "DATA_TYPE_STRING_ARRAY",
  253                 "DATA_TYPE_HRTIME",
  254                 "DATA_TYPE_NVLIST",
  255                 "DATA_TYPE_NVLIST_ARRAY",
  256                 "DATA_TYPE_BOOLEAN_VALUE",
  257                 "DATA_TYPE_INT8",
  258                 "DATA_TYPE_UINT8",
  259                 "DATA_TYPE_BOOLEAN_ARRAY",
  260                 "DATA_TYPE_INT8_ARRAY",
  261                 "DATA_TYPE_UINT8_ARRAY"
  262         };
  263 
  264         unsigned int i, j;
  265         const unsigned char *p, *pair;
  266         int junk;
  267         int encoded_size, decoded_size;
  268 
  269         p = nvlist;
  270         xdr_int(&p, &junk);
  271         xdr_int(&p, &junk);
  272 
  273         pair = p;
  274         xdr_int(&p, &encoded_size);
  275         xdr_int(&p, &decoded_size);
  276         while (encoded_size && decoded_size) {
  277                 int namelen, pairtype, elements;
  278                 const char *pairname;
  279 
  280                 xdr_int(&p, &namelen);
  281                 pairname = (const char*) p;
  282                 p += roundup(namelen, 4);
  283                 xdr_int(&p, &pairtype);
  284 
  285                 for (i = 0; i < indent; i++)
  286                         printf(" ");
  287                 printf("%s %s", typenames[pairtype], pairname);
  288 
  289                 xdr_int(&p, &elements);
  290                 switch (pairtype) {
  291                 case DATA_TYPE_UINT64: {
  292                         uint64_t val;
  293                         xdr_uint64_t(&p, &val);
  294                         printf(" = 0x%jx\n", (uintmax_t)val);
  295                         break;
  296                 }
  297 
  298                 case DATA_TYPE_STRING: {
  299                         int len;
  300                         xdr_int(&p, &len);
  301                         printf(" = \"%s\"\n", p);
  302                         break;
  303                 }
  304 
  305                 case DATA_TYPE_NVLIST:
  306                         printf("\n");
  307                         nvlist_print(p, indent + 1);
  308                         break;
  309 
  310                 case DATA_TYPE_NVLIST_ARRAY:
  311                         for (j = 0; j < elements; j++) {
  312                                 printf("[%d]\n", j);
  313                                 p = nvlist_print(p, indent + 1);
  314                                 if (j != elements - 1) {
  315                                         for (i = 0; i < indent; i++)
  316                                                 printf(" ");
  317                                         printf("%s %s", typenames[pairtype], pairname);
  318                                 }
  319                         }
  320                         break;
  321 
  322                 default:
  323                         printf("\n");
  324                 }
  325 
  326                 p = pair + encoded_size;
  327 
  328                 pair = p;
  329                 xdr_int(&p, &encoded_size);
  330                 xdr_int(&p, &decoded_size);
  331         }
  332 
  333         return p;
  334 }
  335 
  336 #endif
  337 
  338 static int
  339 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
  340     off_t offset, size_t size)
  341 {
  342         size_t psize;
  343         int rc;
  344 
  345         if (!vdev->v_phys_read)
  346                 return (EIO);
  347 
  348         if (bp) {
  349                 psize = BP_GET_PSIZE(bp);
  350         } else {
  351                 psize = size;
  352         }
  353 
  354         /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
  355         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
  356         if (rc)
  357                 return (rc);
  358         if (bp && zio_checksum_verify(bp, buf))
  359                 return (EIO);
  360 
  361         return (0);
  362 }
  363 
  364 static int
  365 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  366     off_t offset, size_t bytes)
  367 {
  368 
  369         return (vdev_read_phys(vdev, bp, buf,
  370                 offset + VDEV_LABEL_START_SIZE, bytes));
  371 }
  372 
  373 
  374 static int
  375 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  376     off_t offset, size_t bytes)
  377 {
  378         vdev_t *kid;
  379         int rc;
  380 
  381         rc = EIO;
  382         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  383                 if (kid->v_state != VDEV_STATE_HEALTHY)
  384                         continue;
  385                 rc = kid->v_read(kid, bp, buf, offset, bytes);
  386                 if (!rc)
  387                         return (0);
  388         }
  389 
  390         return (rc);
  391 }
  392 
  393 static int
  394 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  395     off_t offset, size_t bytes)
  396 {
  397         vdev_t *kid;
  398 
  399         /*
  400          * Here we should have two kids:
  401          * First one which is the one we are replacing and we can trust
  402          * only this one to have valid data, but it might not be present.
  403          * Second one is that one we are replacing with. It is most likely
  404          * healthy, but we can't trust it has needed data, so we won't use it.
  405          */
  406         kid = STAILQ_FIRST(&vdev->v_children);
  407         if (kid == NULL)
  408                 return (EIO);
  409         if (kid->v_state != VDEV_STATE_HEALTHY)
  410                 return (EIO);
  411         return (kid->v_read(kid, bp, buf, offset, bytes));
  412 }
  413 
  414 static vdev_t *
  415 vdev_find(uint64_t guid)
  416 {
  417         vdev_t *vdev;
  418 
  419         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
  420                 if (vdev->v_guid == guid)
  421                         return (vdev);
  422 
  423         return (0);
  424 }
  425 
  426 static vdev_t *
  427 vdev_create(uint64_t guid, vdev_read_t *read)
  428 {
  429         vdev_t *vdev;
  430 
  431         vdev = malloc(sizeof(vdev_t));
  432         memset(vdev, 0, sizeof(vdev_t));
  433         STAILQ_INIT(&vdev->v_children);
  434         vdev->v_guid = guid;
  435         vdev->v_state = VDEV_STATE_OFFLINE;
  436         vdev->v_read = read;
  437         vdev->v_phys_read = 0;
  438         vdev->v_read_priv = 0;
  439         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
  440 
  441         return (vdev);
  442 }
  443 
  444 static int
  445 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
  446     vdev_t **vdevp, int is_newer)
  447 {
  448         int rc;
  449         uint64_t guid, id, ashift, nparity;
  450         const char *type;
  451         const char *path;
  452         vdev_t *vdev, *kid;
  453         const unsigned char *kids;
  454         int nkids, i, is_new;
  455         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
  456 
  457         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
  458                         DATA_TYPE_UINT64, 0, &guid)
  459             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
  460                            DATA_TYPE_UINT64, 0, &id)
  461             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
  462                            DATA_TYPE_STRING, 0, &type)) {
  463                 printf("ZFS: can't find vdev details\n");
  464                 return (ENOENT);
  465         }
  466 
  467         if (strcmp(type, VDEV_TYPE_MIRROR)
  468             && strcmp(type, VDEV_TYPE_DISK)
  469 #ifdef ZFS_TEST
  470             && strcmp(type, VDEV_TYPE_FILE)
  471 #endif
  472             && strcmp(type, VDEV_TYPE_RAIDZ)
  473             && strcmp(type, VDEV_TYPE_REPLACING)) {
  474                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
  475                 return (EIO);
  476         }
  477 
  478         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
  479 
  480         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
  481                         &is_offline);
  482         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
  483                         &is_removed);
  484         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
  485                         &is_faulted);
  486         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
  487                         &is_degraded);
  488         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
  489                         &isnt_present);
  490 
  491         vdev = vdev_find(guid);
  492         if (!vdev) {
  493                 is_new = 1;
  494 
  495                 if (!strcmp(type, VDEV_TYPE_MIRROR))
  496                         vdev = vdev_create(guid, vdev_mirror_read);
  497                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
  498                         vdev = vdev_create(guid, vdev_raidz_read);
  499                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
  500                         vdev = vdev_create(guid, vdev_replacing_read);
  501                 else
  502                         vdev = vdev_create(guid, vdev_disk_read);
  503 
  504                 vdev->v_id = id;
  505                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
  506                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
  507                         DATA_TYPE_UINT64, 0, &ashift) == 0)
  508                         vdev->v_ashift = ashift;
  509                 else
  510                         vdev->v_ashift = 0;
  511                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
  512                         DATA_TYPE_UINT64, 0, &nparity) == 0)
  513                         vdev->v_nparity = nparity;
  514                 else
  515                         vdev->v_nparity = 0;
  516                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
  517                                 DATA_TYPE_STRING, 0, &path) == 0) {
  518                         if (strncmp(path, "/dev/", 5) == 0)
  519                                 path += 5;
  520                         vdev->v_name = strdup(path);
  521                 } else {
  522                         if (!strcmp(type, "raidz")) {
  523                                 if (vdev->v_nparity == 1)
  524                                         vdev->v_name = "raidz1";
  525                                 else if (vdev->v_nparity == 2)
  526                                         vdev->v_name = "raidz2";
  527                                 else if (vdev->v_nparity == 3)
  528                                         vdev->v_name = "raidz3";
  529                                 else {
  530                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
  531                                         return (EIO);
  532                                 }
  533                         } else {
  534                                 vdev->v_name = strdup(type);
  535                         }
  536                 }
  537         } else {
  538                 is_new = 0;
  539         }
  540 
  541         if (is_new || is_newer) {
  542                 /*
  543                  * This is either new vdev or we've already seen this vdev,
  544                  * but from an older vdev label, so let's refresh its state
  545                  * from the newer label.
  546                  */
  547                 if (is_offline)
  548                         vdev->v_state = VDEV_STATE_OFFLINE;
  549                 else if (is_removed)
  550                         vdev->v_state = VDEV_STATE_REMOVED;
  551                 else if (is_faulted)
  552                         vdev->v_state = VDEV_STATE_FAULTED;
  553                 else if (is_degraded)
  554                         vdev->v_state = VDEV_STATE_DEGRADED;
  555                 else if (isnt_present)
  556                         vdev->v_state = VDEV_STATE_CANT_OPEN;
  557         }
  558 
  559         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
  560                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
  561         /*
  562          * Its ok if we don't have any kids.
  563          */
  564         if (rc == 0) {
  565                 vdev->v_nchildren = nkids;
  566                 for (i = 0; i < nkids; i++) {
  567                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
  568                         if (rc)
  569                                 return (rc);
  570                         if (is_new)
  571                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
  572                                                    v_childlink);
  573                         kids = nvlist_next(kids);
  574                 }
  575         } else {
  576                 vdev->v_nchildren = 0;
  577         }
  578 
  579         if (vdevp)
  580                 *vdevp = vdev;
  581         return (0);
  582 }
  583 
  584 static void
  585 vdev_set_state(vdev_t *vdev)
  586 {
  587         vdev_t *kid;
  588         int good_kids;
  589         int bad_kids;
  590 
  591         /*
  592          * A mirror or raidz is healthy if all its kids are healthy. A
  593          * mirror is degraded if any of its kids is healthy; a raidz
  594          * is degraded if at most nparity kids are offline.
  595          */
  596         if (STAILQ_FIRST(&vdev->v_children)) {
  597                 good_kids = 0;
  598                 bad_kids = 0;
  599                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  600                         if (kid->v_state == VDEV_STATE_HEALTHY)
  601                                 good_kids++;
  602                         else
  603                                 bad_kids++;
  604                 }
  605                 if (bad_kids == 0) {
  606                         vdev->v_state = VDEV_STATE_HEALTHY;
  607                 } else {
  608                         if (vdev->v_read == vdev_mirror_read) {
  609                                 if (good_kids) {
  610                                         vdev->v_state = VDEV_STATE_DEGRADED;
  611                                 } else {
  612                                         vdev->v_state = VDEV_STATE_OFFLINE;
  613                                 }
  614                         } else if (vdev->v_read == vdev_raidz_read) {
  615                                 if (bad_kids > vdev->v_nparity) {
  616                                         vdev->v_state = VDEV_STATE_OFFLINE;
  617                                 } else {
  618                                         vdev->v_state = VDEV_STATE_DEGRADED;
  619                                 }
  620                         }
  621                 }
  622         }
  623 }
  624 
  625 static spa_t *
  626 spa_find_by_guid(uint64_t guid)
  627 {
  628         spa_t *spa;
  629 
  630         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  631                 if (spa->spa_guid == guid)
  632                         return (spa);
  633 
  634         return (0);
  635 }
  636 
  637 static spa_t *
  638 spa_find_by_name(const char *name)
  639 {
  640         spa_t *spa;
  641 
  642         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  643                 if (!strcmp(spa->spa_name, name))
  644                         return (spa);
  645 
  646         return (0);
  647 }
  648 
  649 static spa_t *
  650 spa_create(uint64_t guid)
  651 {
  652         spa_t *spa;
  653 
  654         spa = malloc(sizeof(spa_t));
  655         memset(spa, 0, sizeof(spa_t));
  656         STAILQ_INIT(&spa->spa_vdevs);
  657         spa->spa_guid = guid;
  658         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
  659 
  660         return (spa);
  661 }
  662 
  663 static const char *
  664 state_name(vdev_state_t state)
  665 {
  666         static const char* names[] = {
  667                 "UNKNOWN",
  668                 "CLOSED",
  669                 "OFFLINE",
  670                 "REMOVED",
  671                 "CANT_OPEN",
  672                 "FAULTED",
  673                 "DEGRADED",
  674                 "ONLINE"
  675         };
  676         return names[state];
  677 }
  678 
  679 #ifdef BOOT2
  680 
  681 #define pager_printf printf
  682 
  683 #else
  684 
  685 static void
  686 pager_printf(const char *fmt, ...)
  687 {
  688         char line[80];
  689         va_list args;
  690 
  691         va_start(args, fmt);
  692         vsprintf(line, fmt, args);
  693         va_end(args);
  694         pager_output(line);
  695 }
  696 
  697 #endif
  698 
  699 #define STATUS_FORMAT   "        %s %s\n"
  700 
  701 static void
  702 print_state(int indent, const char *name, vdev_state_t state)
  703 {
  704         int i;
  705         char buf[512];
  706 
  707         buf[0] = 0;
  708         for (i = 0; i < indent; i++)
  709                 strcat(buf, "  ");
  710         strcat(buf, name);
  711         pager_printf(STATUS_FORMAT, buf, state_name(state));
  712         
  713 }
  714 
  715 static void
  716 vdev_status(vdev_t *vdev, int indent)
  717 {
  718         vdev_t *kid;
  719         print_state(indent, vdev->v_name, vdev->v_state);
  720 
  721         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  722                 vdev_status(kid, indent + 1);
  723         }
  724 }
  725 
  726 static void
  727 spa_status(spa_t *spa)
  728 {
  729         vdev_t *vdev;
  730         int good_kids, bad_kids, degraded_kids;
  731         vdev_state_t state;
  732 
  733         pager_printf("  pool: %s\n", spa->spa_name);
  734         pager_printf("config:\n\n");
  735         pager_printf(STATUS_FORMAT, "NAME", "STATE");
  736 
  737         good_kids = 0;
  738         degraded_kids = 0;
  739         bad_kids = 0;
  740         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  741                 if (vdev->v_state == VDEV_STATE_HEALTHY)
  742                         good_kids++;
  743                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
  744                         degraded_kids++;
  745                 else
  746                         bad_kids++;
  747         }
  748 
  749         state = VDEV_STATE_CLOSED;
  750         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
  751                 state = VDEV_STATE_HEALTHY;
  752         else if ((good_kids + degraded_kids) > 0)
  753                 state = VDEV_STATE_DEGRADED;
  754 
  755         print_state(0, spa->spa_name, state);
  756         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  757                 vdev_status(vdev, 1);
  758         }
  759 }
  760 
  761 static void
  762 spa_all_status(void)
  763 {
  764         spa_t *spa;
  765         int first = 1;
  766 
  767         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
  768                 if (!first)
  769                         pager_printf("\n");
  770                 first = 0;
  771                 spa_status(spa);
  772         }
  773 }
  774 
  775 static int
  776 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
  777 {
  778         vdev_t vtmp;
  779         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
  780         spa_t *spa;
  781         vdev_t *vdev, *top_vdev, *pool_vdev;
  782         off_t off;
  783         blkptr_t bp;
  784         const unsigned char *nvlist;
  785         uint64_t val;
  786         uint64_t guid;
  787         uint64_t pool_txg, pool_guid;
  788         uint64_t is_log;
  789         const char *pool_name;
  790         const unsigned char *vdevs;
  791         int i, rc, is_newer;
  792         char *upbuf;
  793         const struct uberblock *up;
  794 
  795         /*
  796          * Load the vdev label and figure out which
  797          * uberblock is most current.
  798          */
  799         memset(&vtmp, 0, sizeof(vtmp));
  800         vtmp.v_phys_read = read;
  801         vtmp.v_read_priv = read_priv;
  802         off = offsetof(vdev_label_t, vl_vdev_phys);
  803         BP_ZERO(&bp);
  804         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
  805         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
  806         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  807         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  808         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
  809         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  810         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
  811                 return (EIO);
  812 
  813         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
  814                 return (EIO);
  815         }
  816 
  817         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
  818 
  819         if (nvlist_find(nvlist,
  820                         ZPOOL_CONFIG_VERSION,
  821                         DATA_TYPE_UINT64, 0, &val)) {
  822                 return (EIO);
  823         }
  824 
  825         if (val > SPA_VERSION) {
  826                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
  827                     (unsigned) val, (unsigned) SPA_VERSION);
  828                 return (EIO);
  829         }
  830 
  831         if (nvlist_find(nvlist,
  832                         ZPOOL_CONFIG_POOL_STATE,
  833                         DATA_TYPE_UINT64, 0, &val)) {
  834                 return (EIO);
  835         }
  836 
  837         if (val == POOL_STATE_DESTROYED) {
  838                 /* We don't boot only from destroyed pools. */
  839                 return (EIO);
  840         }
  841 
  842         if (nvlist_find(nvlist,
  843                         ZPOOL_CONFIG_POOL_TXG,
  844                         DATA_TYPE_UINT64, 0, &pool_txg)
  845             || nvlist_find(nvlist,
  846                            ZPOOL_CONFIG_POOL_GUID,
  847                            DATA_TYPE_UINT64, 0, &pool_guid)
  848             || nvlist_find(nvlist,
  849                            ZPOOL_CONFIG_POOL_NAME,
  850                            DATA_TYPE_STRING, 0, &pool_name)) {
  851                 /*
  852                  * Cache and spare devices end up here - just ignore
  853                  * them.
  854                  */
  855                 /*printf("ZFS: can't find pool details\n");*/
  856                 return (EIO);
  857         }
  858 
  859         is_log = 0;
  860         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
  861             &is_log);
  862         if (is_log)
  863                 return (EIO);
  864 
  865         /*
  866          * Create the pool if this is the first time we've seen it.
  867          */
  868         spa = spa_find_by_guid(pool_guid);
  869         if (!spa) {
  870                 spa = spa_create(pool_guid);
  871                 spa->spa_name = strdup(pool_name);
  872         }
  873         if (pool_txg > spa->spa_txg) {
  874                 spa->spa_txg = pool_txg;
  875                 is_newer = 1;
  876         } else
  877                 is_newer = 0;
  878 
  879         /*
  880          * Get the vdev tree and create our in-core copy of it.
  881          * If we already have a vdev with this guid, this must
  882          * be some kind of alias (overlapping slices, dangerously dedicated
  883          * disks etc).
  884          */
  885         if (nvlist_find(nvlist,
  886                         ZPOOL_CONFIG_GUID,
  887                         DATA_TYPE_UINT64, 0, &guid)) {
  888                 return (EIO);
  889         }
  890         vdev = vdev_find(guid);
  891         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
  892                 return (EIO);
  893 
  894         if (nvlist_find(nvlist,
  895                         ZPOOL_CONFIG_VDEV_TREE,
  896                         DATA_TYPE_NVLIST, 0, &vdevs)) {
  897                 return (EIO);
  898         }
  899 
  900         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
  901         if (rc)
  902                 return (rc);
  903 
  904         /*
  905          * Add the toplevel vdev to the pool if it's not already there.
  906          */
  907         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
  908                 if (top_vdev == pool_vdev)
  909                         break;
  910         if (!pool_vdev && top_vdev)
  911                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
  912 
  913         /*
  914          * We should already have created an incomplete vdev for this
  915          * vdev. Find it and initialise it with our read proc.
  916          */
  917         vdev = vdev_find(guid);
  918         if (vdev) {
  919                 vdev->v_phys_read = read;
  920                 vdev->v_read_priv = read_priv;
  921                 vdev->v_state = VDEV_STATE_HEALTHY;
  922         } else {
  923                 printf("ZFS: inconsistent nvlist contents\n");
  924                 return (EIO);
  925         }
  926 
  927         /*
  928          * Re-evaluate top-level vdev state.
  929          */
  930         vdev_set_state(top_vdev);
  931 
  932         /*
  933          * Ok, we are happy with the pool so far. Lets find
  934          * the best uberblock and then we can actually access
  935          * the contents of the pool.
  936          */
  937         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
  938         up = (const struct uberblock *)upbuf;
  939         for (i = 0;
  940              i < VDEV_UBERBLOCK_COUNT(vdev);
  941              i++) {
  942                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
  943                 BP_ZERO(&bp);
  944                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
  945                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
  946                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
  947                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  948                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  949                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  950 
  951                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
  952                         continue;
  953 
  954                 if (up->ub_magic != UBERBLOCK_MAGIC)
  955                         continue;
  956                 if (up->ub_txg < spa->spa_txg)
  957                         continue;
  958                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
  959                         spa->spa_uberblock = *up;
  960                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
  961                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
  962                                 spa->spa_uberblock = *up;
  963                 }
  964         }
  965         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
  966 
  967         if (spap)
  968                 *spap = spa;
  969         return (0);
  970 }
  971 
  972 static int
  973 ilog2(int n)
  974 {
  975         int v;
  976 
  977         for (v = 0; v < 32; v++)
  978                 if (n == (1 << v))
  979                         return v;
  980         return -1;
  981 }
  982 
  983 static int
  984 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
  985 {
  986         blkptr_t gbh_bp;
  987         zio_gbh_phys_t zio_gb;
  988         char *pbuf;
  989         int i;
  990 
  991         /* Artificial BP for gang block header. */
  992         gbh_bp = *bp;
  993         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
  994         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
  995         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
  996         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
  997         for (i = 0; i < SPA_DVAS_PER_BP; i++)
  998                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
  999 
 1000         /* Read gang header block using the artificial BP. */
 1001         if (zio_read(spa, &gbh_bp, &zio_gb))
 1002                 return (EIO);
 1003 
 1004         pbuf = buf;
 1005         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
 1006                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
 1007 
 1008                 if (BP_IS_HOLE(gbp))
 1009                         continue;
 1010                 if (zio_read(spa, gbp, pbuf))
 1011                         return (EIO);
 1012                 pbuf += BP_GET_PSIZE(gbp);
 1013         }
 1014 
 1015         if (zio_checksum_verify(bp, buf))
 1016                 return (EIO);
 1017         return (0);
 1018 }
 1019 
 1020 static int
 1021 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
 1022 {
 1023         int cpfunc = BP_GET_COMPRESS(bp);
 1024         uint64_t align, size;
 1025         void *pbuf;
 1026         int i, error;
 1027 
 1028         error = EIO;
 1029 
 1030         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
 1031                 const dva_t *dva = &bp->blk_dva[i];
 1032                 vdev_t *vdev;
 1033                 int vdevid;
 1034                 off_t offset;
 1035 
 1036                 if (!dva->dva_word[0] && !dva->dva_word[1])
 1037                         continue;
 1038 
 1039                 vdevid = DVA_GET_VDEV(dva);
 1040                 offset = DVA_GET_OFFSET(dva);
 1041                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
 1042                         if (vdev->v_id == vdevid)
 1043                                 break;
 1044                 }
 1045                 if (!vdev || !vdev->v_read)
 1046                         continue;
 1047 
 1048                 size = BP_GET_PSIZE(bp);
 1049                 if (vdev->v_read == vdev_raidz_read) {
 1050                         align = 1ULL << vdev->v_top->v_ashift;
 1051                         if (P2PHASE(size, align) != 0)
 1052                                 size = P2ROUNDUP(size, align);
 1053                 }
 1054                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
 1055                         pbuf = zfs_alloc(size);
 1056                 else
 1057                         pbuf = buf;
 1058 
 1059                 if (DVA_GET_GANG(dva))
 1060                         error = zio_read_gang(spa, bp, pbuf);
 1061                 else
 1062                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
 1063                 if (error == 0) {
 1064                         if (cpfunc != ZIO_COMPRESS_OFF)
 1065                                 error = zio_decompress_data(cpfunc, pbuf,
 1066                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
 1067                         else if (size != BP_GET_PSIZE(bp))
 1068                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
 1069                 }
 1070                 if (buf != pbuf)
 1071                         zfs_free(pbuf, size);
 1072                 if (error == 0)
 1073                         break;
 1074         }
 1075         if (error != 0)
 1076                 printf("ZFS: i/o error - all block copies unavailable\n");
 1077         return (error);
 1078 }
 1079 
 1080 static int
 1081 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
 1082 {
 1083         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
 1084         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1085         int nlevels = dnode->dn_nlevels;
 1086         int i, rc;
 1087 
 1088         /*
 1089          * Note: bsize may not be a power of two here so we need to do an
 1090          * actual divide rather than a bitshift.
 1091          */
 1092         while (buflen > 0) {
 1093                 uint64_t bn = offset / bsize;
 1094                 int boff = offset % bsize;
 1095                 int ibn;
 1096                 const blkptr_t *indbp;
 1097                 blkptr_t bp;
 1098 
 1099                 if (bn > dnode->dn_maxblkid)
 1100                         return (EIO);
 1101 
 1102                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
 1103                         goto cached;
 1104 
 1105                 indbp = dnode->dn_blkptr;
 1106                 for (i = 0; i < nlevels; i++) {
 1107                         /*
 1108                          * Copy the bp from the indirect array so that
 1109                          * we can re-use the scratch buffer for multi-level
 1110                          * objects.
 1111                          */
 1112                         ibn = bn >> ((nlevels - i - 1) * ibshift);
 1113                         ibn &= ((1 << ibshift) - 1);
 1114                         bp = indbp[ibn];
 1115                         rc = zio_read(spa, &bp, dnode_cache_buf);
 1116                         if (rc)
 1117                                 return (rc);
 1118                         indbp = (const blkptr_t *) dnode_cache_buf;
 1119                 }
 1120                 dnode_cache_obj = dnode;
 1121                 dnode_cache_bn = bn;
 1122         cached:
 1123 
 1124                 /*
 1125                  * The buffer contains our data block. Copy what we
 1126                  * need from it and loop.
 1127                  */ 
 1128                 i = bsize - boff;
 1129                 if (i > buflen) i = buflen;
 1130                 memcpy(buf, &dnode_cache_buf[boff], i);
 1131                 buf = ((char*) buf) + i;
 1132                 offset += i;
 1133                 buflen -= i;
 1134         }
 1135 
 1136         return (0);
 1137 }
 1138 
 1139 /*
 1140  * Lookup a value in a microzap directory. Assumes that the zap
 1141  * scratch buffer contains the directory contents.
 1142  */
 1143 static int
 1144 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1145 {
 1146         const mzap_phys_t *mz;
 1147         const mzap_ent_phys_t *mze;
 1148         size_t size;
 1149         int chunks, i;
 1150 
 1151         /*
 1152          * Microzap objects use exactly one block. Read the whole
 1153          * thing.
 1154          */
 1155         size = dnode->dn_datablkszsec * 512;
 1156 
 1157         mz = (const mzap_phys_t *) zap_scratch;
 1158         chunks = size / MZAP_ENT_LEN - 1;
 1159 
 1160         for (i = 0; i < chunks; i++) {
 1161                 mze = &mz->mz_chunk[i];
 1162                 if (!strcmp(mze->mze_name, name)) {
 1163                         *value = mze->mze_value;
 1164                         return (0);
 1165                 }
 1166         }
 1167 
 1168         return (ENOENT);
 1169 }
 1170 
 1171 /*
 1172  * Compare a name with a zap leaf entry. Return non-zero if the name
 1173  * matches.
 1174  */
 1175 static int
 1176 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
 1177 {
 1178         size_t namelen;
 1179         const zap_leaf_chunk_t *nc;
 1180         const char *p;
 1181 
 1182         namelen = zc->l_entry.le_name_length;
 1183                         
 1184         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
 1185         p = name;
 1186         while (namelen > 0) {
 1187                 size_t len;
 1188                 len = namelen;
 1189                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1190                         len = ZAP_LEAF_ARRAY_BYTES;
 1191                 if (memcmp(p, nc->l_array.la_array, len))
 1192                         return (0);
 1193                 p += len;
 1194                 namelen -= len;
 1195                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
 1196         }
 1197 
 1198         return 1;
 1199 }
 1200 
 1201 /*
 1202  * Extract a uint64_t value from a zap leaf entry.
 1203  */
 1204 static uint64_t
 1205 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
 1206 {
 1207         const zap_leaf_chunk_t *vc;
 1208         int i;
 1209         uint64_t value;
 1210         const uint8_t *p;
 1211 
 1212         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
 1213         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
 1214                 value = (value << 8) | p[i];
 1215         }
 1216 
 1217         return value;
 1218 }
 1219 
 1220 /*
 1221  * Lookup a value in a fatzap directory. Assumes that the zap scratch
 1222  * buffer contains the directory header.
 1223  */
 1224 static int
 1225 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1226 {
 1227         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1228         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1229         fat_zap_t z;
 1230         uint64_t *ptrtbl;
 1231         uint64_t hash;
 1232         int rc;
 1233 
 1234         if (zh.zap_magic != ZAP_MAGIC)
 1235                 return (EIO);
 1236 
 1237         z.zap_block_shift = ilog2(bsize);
 1238         z.zap_phys = (zap_phys_t *) zap_scratch;
 1239 
 1240         /*
 1241          * Figure out where the pointer table is and read it in if necessary.
 1242          */
 1243         if (zh.zap_ptrtbl.zt_blk) {
 1244                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
 1245                                zap_scratch, bsize);
 1246                 if (rc)
 1247                         return (rc);
 1248                 ptrtbl = (uint64_t *) zap_scratch;
 1249         } else {
 1250                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
 1251         }
 1252 
 1253         hash = zap_hash(zh.zap_salt, name);
 1254 
 1255         zap_leaf_t zl;
 1256         zl.l_bs = z.zap_block_shift;
 1257 
 1258         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
 1259         zap_leaf_chunk_t *zc;
 1260 
 1261         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
 1262         if (rc)
 1263                 return (rc);
 1264 
 1265         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1266 
 1267         /*
 1268          * Make sure this chunk matches our hash.
 1269          */
 1270         if (zl.l_phys->l_hdr.lh_prefix_len > 0
 1271             && zl.l_phys->l_hdr.lh_prefix
 1272             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
 1273                 return (ENOENT);
 1274 
 1275         /*
 1276          * Hash within the chunk to find our entry.
 1277          */
 1278         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
 1279         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
 1280         h = zl.l_phys->l_hash[h];
 1281         if (h == 0xffff)
 1282                 return (ENOENT);
 1283         zc = &ZAP_LEAF_CHUNK(&zl, h);
 1284         while (zc->l_entry.le_hash != hash) {
 1285                 if (zc->l_entry.le_next == 0xffff) {
 1286                         zc = 0;
 1287                         break;
 1288                 }
 1289                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
 1290         }
 1291         if (fzap_name_equal(&zl, zc, name)) {
 1292                 *value = fzap_leaf_value(&zl, zc);
 1293                 return (0);
 1294         }
 1295 
 1296         return (ENOENT);
 1297 }
 1298 
 1299 /*
 1300  * Lookup a name in a zap object and return its value as a uint64_t.
 1301  */
 1302 static int
 1303 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1304 {
 1305         int rc;
 1306         uint64_t zap_type;
 1307         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1308 
 1309         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
 1310         if (rc)
 1311                 return (rc);
 1312 
 1313         zap_type = *(uint64_t *) zap_scratch;
 1314         if (zap_type == ZBT_MICRO)
 1315                 return mzap_lookup(dnode, name, value);
 1316         else if (zap_type == ZBT_HEADER)
 1317                 return fzap_lookup(spa, dnode, name, value);
 1318         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
 1319         return (EIO);
 1320 }
 1321 
 1322 #ifdef BOOT2
 1323 
 1324 /*
 1325  * List a microzap directory. Assumes that the zap scratch buffer contains
 1326  * the directory contents.
 1327  */
 1328 static int
 1329 mzap_list(const dnode_phys_t *dnode)
 1330 {
 1331         const mzap_phys_t *mz;
 1332         const mzap_ent_phys_t *mze;
 1333         size_t size;
 1334         int chunks, i;
 1335 
 1336         /*
 1337          * Microzap objects use exactly one block. Read the whole
 1338          * thing.
 1339          */
 1340         size = dnode->dn_datablkszsec * 512;
 1341         mz = (const mzap_phys_t *) zap_scratch;
 1342         chunks = size / MZAP_ENT_LEN - 1;
 1343 
 1344         for (i = 0; i < chunks; i++) {
 1345                 mze = &mz->mz_chunk[i];
 1346                 if (mze->mze_name[0])
 1347                         //printf("%-32s 0x%jx\n", mze->mze_name, (uintmax_t)mze->mze_value);
 1348                         printf("%s\n", mze->mze_name);
 1349         }
 1350 
 1351         return (0);
 1352 }
 1353 
 1354 /*
 1355  * List a fatzap directory. Assumes that the zap scratch buffer contains
 1356  * the directory header.
 1357  */
 1358 static int
 1359 fzap_list(const spa_t *spa, const dnode_phys_t *dnode)
 1360 {
 1361         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1362         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1363         fat_zap_t z;
 1364         int i, j;
 1365 
 1366         if (zh.zap_magic != ZAP_MAGIC)
 1367                 return (EIO);
 1368 
 1369         z.zap_block_shift = ilog2(bsize);
 1370         z.zap_phys = (zap_phys_t *) zap_scratch;
 1371 
 1372         /*
 1373          * This assumes that the leaf blocks start at block 1. The
 1374          * documentation isn't exactly clear on this.
 1375          */
 1376         zap_leaf_t zl;
 1377         zl.l_bs = z.zap_block_shift;
 1378         for (i = 0; i < zh.zap_num_leafs; i++) {
 1379                 off_t off = (i + 1) << zl.l_bs;
 1380                 char name[256], *p;
 1381                 uint64_t value;
 1382 
 1383                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
 1384                         return (EIO);
 1385 
 1386                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1387 
 1388                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
 1389                         zap_leaf_chunk_t *zc, *nc;
 1390                         int namelen;
 1391 
 1392                         zc = &ZAP_LEAF_CHUNK(&zl, j);
 1393                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
 1394                                 continue;
 1395                         namelen = zc->l_entry.le_name_length;
 1396                         if (namelen > sizeof(name))
 1397                                 namelen = sizeof(name);
 1398 
 1399                         /*
 1400                          * Paste the name back together.
 1401                          */
 1402                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
 1403                         p = name;
 1404                         while (namelen > 0) {
 1405                                 int len;
 1406                                 len = namelen;
 1407                                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1408                                         len = ZAP_LEAF_ARRAY_BYTES;
 1409                                 memcpy(p, nc->l_array.la_array, len);
 1410                                 p += len;
 1411                                 namelen -= len;
 1412                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
 1413                         }
 1414 
 1415                         /*
 1416                          * Assume the first eight bytes of the value are
 1417                          * a uint64_t.
 1418                          */
 1419                         value = fzap_leaf_value(&zl, zc);
 1420 
 1421                         printf("%s 0x%jx\n", name, (uintmax_t)value);
 1422                 }
 1423         }
 1424 
 1425         return (0);
 1426 }
 1427 
 1428 /*
 1429  * List a zap directory.
 1430  */
 1431 static int
 1432 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
 1433 {
 1434         uint64_t zap_type;
 1435         size_t size = dnode->dn_datablkszsec * 512;
 1436 
 1437         if (dnode_read(spa, dnode, 0, zap_scratch, size))
 1438                 return (EIO);
 1439 
 1440         zap_type = *(uint64_t *) zap_scratch;
 1441         if (zap_type == ZBT_MICRO)
 1442                 return mzap_list(dnode);
 1443         else
 1444                 return fzap_list(spa, dnode);
 1445 }
 1446 
 1447 #endif
 1448 
 1449 static int
 1450 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
 1451 {
 1452         off_t offset;
 1453 
 1454         offset = objnum * sizeof(dnode_phys_t);
 1455         return dnode_read(spa, &os->os_meta_dnode, offset,
 1456                 dnode, sizeof(dnode_phys_t));
 1457 }
 1458 
 1459 static int
 1460 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
 1461 {
 1462         const mzap_phys_t *mz;
 1463         const mzap_ent_phys_t *mze;
 1464         size_t size;
 1465         int chunks, i;
 1466 
 1467         /*
 1468          * Microzap objects use exactly one block. Read the whole
 1469          * thing.
 1470          */
 1471         size = dnode->dn_datablkszsec * 512;
 1472 
 1473         mz = (const mzap_phys_t *) zap_scratch;
 1474         chunks = size / MZAP_ENT_LEN - 1;
 1475 
 1476         for (i = 0; i < chunks; i++) {
 1477                 mze = &mz->mz_chunk[i];
 1478                 if (value == mze->mze_value) {
 1479                         strcpy(name, mze->mze_name);
 1480                         return (0);
 1481                 }
 1482         }
 1483 
 1484         return (ENOENT);
 1485 }
 1486 
 1487 static void
 1488 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
 1489 {
 1490         size_t namelen;
 1491         const zap_leaf_chunk_t *nc;
 1492         char *p;
 1493 
 1494         namelen = zc->l_entry.le_name_length;
 1495 
 1496         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
 1497         p = name;
 1498         while (namelen > 0) {
 1499                 size_t len;
 1500                 len = namelen;
 1501                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1502                         len = ZAP_LEAF_ARRAY_BYTES;
 1503                 memcpy(p, nc->l_array.la_array, len);
 1504                 p += len;
 1505                 namelen -= len;
 1506                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
 1507         }
 1508 
 1509         *p = '\0';
 1510 }
 1511 
 1512 static int
 1513 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
 1514 {
 1515         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1516         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1517         fat_zap_t z;
 1518         uint64_t *ptrtbl;
 1519         uint64_t hash;
 1520         int rc;
 1521 
 1522         if (zh.zap_magic != ZAP_MAGIC)
 1523                 return (EIO);
 1524 
 1525         z.zap_block_shift = ilog2(bsize);
 1526         z.zap_phys = (zap_phys_t *) zap_scratch;
 1527 
 1528         /*
 1529          * Figure out where the pointer table is and read it in if necessary.
 1530          */
 1531         if (zh.zap_ptrtbl.zt_blk) {
 1532                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
 1533                                zap_scratch, bsize);
 1534                 if (rc)
 1535                         return (rc);
 1536                 ptrtbl = (uint64_t *) zap_scratch;
 1537         } else {
 1538                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
 1539         }
 1540 
 1541         hash = zap_hash(zh.zap_salt, name);
 1542 
 1543         zap_leaf_t zl;
 1544         zl.l_bs = z.zap_block_shift;
 1545 
 1546         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
 1547         zap_leaf_chunk_t *zc;
 1548 
 1549         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
 1550         if (rc)
 1551                 return (rc);
 1552 
 1553         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1554 
 1555         /*
 1556          * Make sure this chunk matches our hash.
 1557          */
 1558         if (zl.l_phys->l_hdr.lh_prefix_len > 0
 1559             && zl.l_phys->l_hdr.lh_prefix
 1560             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
 1561                 return (ENOENT);
 1562 
 1563         /*
 1564          * Hash within the chunk to find our entry.
 1565          */
 1566         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
 1567         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
 1568         h = zl.l_phys->l_hash[h];
 1569         if (h == 0xffff)
 1570                 return (ENOENT);
 1571         zc = &ZAP_LEAF_CHUNK(&zl, h);
 1572         while (zc->l_entry.le_hash != hash) {
 1573                 if (zc->l_entry.le_next == 0xffff) {
 1574                         zc = 0;
 1575                         break;
 1576                 }
 1577                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
 1578         }
 1579         if (fzap_leaf_value(&zl, zc) == value) {
 1580                 fzap_name_copy(&zl, zc, name);
 1581                 return (0);
 1582         }
 1583 
 1584         return (ENOENT);
 1585 }
 1586 
 1587 static int
 1588 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
 1589 {
 1590         int rc;
 1591         uint64_t zap_type;
 1592         size_t size = dnode->dn_datablkszsec * 512;
 1593 
 1594         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
 1595         if (rc)
 1596                 return (rc);
 1597 
 1598         zap_type = *(uint64_t *) zap_scratch;
 1599         if (zap_type == ZBT_MICRO)
 1600                 return mzap_rlookup(spa, dnode, name, value);
 1601         else
 1602                 return fzap_rlookup(spa, dnode, name, value);
 1603 }
 1604 
 1605 static int
 1606 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
 1607 {
 1608         char name[256];
 1609         char component[256];
 1610         uint64_t dir_obj, parent_obj, child_dir_zapobj;
 1611         dnode_phys_t child_dir_zap, dataset, dir, parent;
 1612         dsl_dir_phys_t *dd;
 1613         dsl_dataset_phys_t *ds;
 1614         char *p;
 1615         int len;
 1616 
 1617         p = &name[sizeof(name) - 1];
 1618         *p = '\0';
 1619 
 1620         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
 1621                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
 1622                 return (EIO);
 1623         }
 1624         ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
 1625         dir_obj = ds->ds_dir_obj;
 1626 
 1627         for (;;) {
 1628                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
 1629                         return (EIO);
 1630                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
 1631 
 1632                 /* Actual loop condition. */
 1633                 parent_obj  = dd->dd_parent_obj;
 1634                 if (parent_obj == 0)
 1635                         break;
 1636 
 1637                 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
 1638                         return (EIO);
 1639                 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
 1640                 child_dir_zapobj = dd->dd_child_dir_zapobj;
 1641                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
 1642                         return (EIO);
 1643                 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
 1644                         return (EIO);
 1645 
 1646                 len = strlen(component);
 1647                 p -= len;
 1648                 memcpy(p, component, len);
 1649                 --p;
 1650                 *p = '/';
 1651 
 1652                 /* Actual loop iteration. */
 1653                 dir_obj = parent_obj;
 1654         }
 1655 
 1656         if (*p != '\0')
 1657                 ++p;
 1658         strcpy(result, p);
 1659 
 1660         return (0);
 1661 }
 1662 
 1663 static int
 1664 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
 1665 {
 1666         char element[256];
 1667         uint64_t dir_obj, child_dir_zapobj;
 1668         dnode_phys_t child_dir_zap, dir;
 1669         dsl_dir_phys_t *dd;
 1670         const char *p, *q;
 1671 
 1672         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
 1673                 return (EIO);
 1674         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &dir_obj))
 1675                 return (EIO);
 1676 
 1677         p = name;
 1678         for (;;) {
 1679                 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
 1680                         return (EIO);
 1681                 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
 1682 
 1683                 while (*p == '/')
 1684                         p++;
 1685                 /* Actual loop condition #1. */
 1686                 if (*p == '\0')
 1687                         break;
 1688 
 1689                 q = strchr(p, '/');
 1690                 if (q) {
 1691                         memcpy(element, p, q - p);
 1692                         element[q - p] = '\0';
 1693                         p = q + 1;
 1694                 } else {
 1695                         strcpy(element, p);
 1696                         p += strlen(p);
 1697                 }
 1698 
 1699                 child_dir_zapobj = dd->dd_child_dir_zapobj;
 1700                 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
 1701                         return (EIO);
 1702 
 1703                 /* Actual loop condition #2. */
 1704                 if (zap_lookup(spa, &child_dir_zap, element, &dir_obj) != 0)
 1705                         return (ENOENT);
 1706         }
 1707 
 1708         *objnum = dd->dd_head_dataset_obj;
 1709         return (0);
 1710 }
 1711 
 1712 /*
 1713  * Find the object set given the object number of its dataset object
 1714  * and return its details in *objset
 1715  */
 1716 static int
 1717 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
 1718 {
 1719         dnode_phys_t dataset;
 1720         dsl_dataset_phys_t *ds;
 1721 
 1722         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
 1723                 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
 1724                 return (EIO);
 1725         }
 1726 
 1727         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
 1728         if (zio_read(spa, &ds->ds_bp, objset)) {
 1729                 printf("ZFS: can't read object set for dataset %ju\n",
 1730                     (uintmax_t)objnum);
 1731                 return (EIO);
 1732         }
 1733 
 1734         return (0);
 1735 }
 1736 
 1737 /*
 1738  * Find the object set pointed to by the BOOTFS property or the root
 1739  * dataset if there is none and return its details in *objset
 1740  */
 1741 static int
 1742 zfs_get_root(const spa_t *spa, uint64_t *objid)
 1743 {
 1744         dnode_phys_t dir, propdir;
 1745         uint64_t props, bootfs, root;
 1746 
 1747         *objid = 0;
 1748 
 1749         /*
 1750          * Start with the MOS directory object.
 1751          */
 1752         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
 1753                 printf("ZFS: can't read MOS object directory\n");
 1754                 return (EIO);
 1755         }
 1756 
 1757         /*
 1758          * Lookup the pool_props and see if we can find a bootfs.
 1759          */
 1760         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
 1761              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
 1762              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
 1763              && bootfs != 0)
 1764         {
 1765                 *objid = bootfs;
 1766                 return (0);
 1767         }
 1768         /*
 1769          * Lookup the root dataset directory
 1770          */
 1771         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
 1772             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
 1773                 printf("ZFS: can't find root dsl_dir\n");
 1774                 return (EIO);
 1775         }
 1776 
 1777         /*
 1778          * Use the information from the dataset directory's bonus buffer
 1779          * to find the dataset object and from that the object set itself.
 1780          */
 1781         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
 1782         *objid = dd->dd_head_dataset_obj;
 1783         return (0);
 1784 }
 1785 
 1786 static int
 1787 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
 1788 {
 1789 
 1790         mount->spa = spa;
 1791 
 1792         /*
 1793          * Find the root object set if not explicitly provided
 1794          */
 1795         if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
 1796                 printf("ZFS: can't find root filesystem\n");
 1797                 return (EIO);
 1798         }
 1799 
 1800         if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
 1801                 printf("ZFS: can't open root filesystem\n");
 1802                 return (EIO);
 1803         }
 1804 
 1805         mount->rootobj = rootobj;
 1806 
 1807         return (0);
 1808 }
 1809 
 1810 static int
 1811 zfs_spa_init(spa_t *spa)
 1812 {
 1813 
 1814         if (spa->spa_inited)
 1815                 return (0);
 1816         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
 1817                 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
 1818                 return (EIO);
 1819         }
 1820         spa->spa_inited = 1;
 1821         return (0);
 1822 }
 1823 
 1824 static int
 1825 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
 1826 {
 1827 
 1828         if (dn->dn_bonustype != DMU_OT_SA) {
 1829                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
 1830 
 1831                 sb->st_mode = zp->zp_mode;
 1832                 sb->st_uid = zp->zp_uid;
 1833                 sb->st_gid = zp->zp_gid;
 1834                 sb->st_size = zp->zp_size;
 1835         } else {
 1836                 sa_hdr_phys_t *sahdrp;
 1837                 int hdrsize;
 1838                 size_t size = 0;
 1839                 void *buf = NULL;
 1840 
 1841                 if (dn->dn_bonuslen != 0)
 1842                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
 1843                 else {
 1844                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
 1845                                 blkptr_t *bp = &dn->dn_spill;
 1846                                 int error;
 1847 
 1848                                 size = BP_GET_LSIZE(bp);
 1849                                 buf = zfs_alloc(size);
 1850                                 error = zio_read(spa, bp, buf);
 1851                                 if (error != 0) {
 1852                                         zfs_free(buf, size);
 1853                                         return (error);
 1854                                 }
 1855                                 sahdrp = buf;
 1856                         } else {
 1857                                 return (EIO);
 1858                         }
 1859                 }
 1860                 hdrsize = SA_HDR_SIZE(sahdrp);
 1861                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
 1862                     SA_MODE_OFFSET);
 1863                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
 1864                     SA_UID_OFFSET);
 1865                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
 1866                     SA_GID_OFFSET);
 1867                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
 1868                     SA_SIZE_OFFSET);
 1869                 if (buf != NULL)
 1870                         zfs_free(buf, size);
 1871         }
 1872 
 1873         return (0);
 1874 }
 1875 
 1876 /*
 1877  * Lookup a file and return its dnode.
 1878  */
 1879 static int
 1880 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
 1881 {
 1882         int rc;
 1883         uint64_t objnum, rootnum, parentnum;
 1884         const spa_t *spa;
 1885         dnode_phys_t dn;
 1886         const char *p, *q;
 1887         char element[256];
 1888         char path[1024];
 1889         int symlinks_followed = 0;
 1890         struct stat sb;
 1891 
 1892         spa = mount->spa;
 1893         if (mount->objset.os_type != DMU_OST_ZFS) {
 1894                 printf("ZFS: unexpected object set type %ju\n",
 1895                     (uintmax_t)mount->objset.os_type);
 1896                 return (EIO);
 1897         }
 1898 
 1899         /*
 1900          * Get the root directory dnode.
 1901          */
 1902         rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
 1903         if (rc)
 1904                 return (rc);
 1905 
 1906         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
 1907         if (rc)
 1908                 return (rc);
 1909 
 1910         rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn);
 1911         if (rc)
 1912                 return (rc);
 1913 
 1914         objnum = rootnum;
 1915         p = upath;
 1916         while (p && *p) {
 1917                 while (*p == '/')
 1918                         p++;
 1919                 if (!*p)
 1920                         break;
 1921                 q = strchr(p, '/');
 1922                 if (q) {
 1923                         memcpy(element, p, q - p);
 1924                         element[q - p] = 0;
 1925                         p = q;
 1926                 } else {
 1927                         strcpy(element, p);
 1928                         p = 0;
 1929                 }
 1930 
 1931                 rc = zfs_dnode_stat(spa, &dn, &sb);
 1932                 if (rc)
 1933                         return (rc);
 1934                 if (!S_ISDIR(sb.st_mode))
 1935                         return (ENOTDIR);
 1936 
 1937                 parentnum = objnum;
 1938                 rc = zap_lookup(spa, &dn, element, &objnum);
 1939                 if (rc)
 1940                         return (rc);
 1941                 objnum = ZFS_DIRENT_OBJ(objnum);
 1942 
 1943                 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
 1944                 if (rc)
 1945                         return (rc);
 1946 
 1947                 /*
 1948                  * Check for symlink.
 1949                  */
 1950                 rc = zfs_dnode_stat(spa, &dn, &sb);
 1951                 if (rc)
 1952                         return (rc);
 1953                 if (S_ISLNK(sb.st_mode)) {
 1954                         if (symlinks_followed > 10)
 1955                                 return (EMLINK);
 1956                         symlinks_followed++;
 1957 
 1958                         /*
 1959                          * Read the link value and copy the tail of our
 1960                          * current path onto the end.
 1961                          */
 1962                         if (p)
 1963                                 strcpy(&path[sb.st_size], p);
 1964                         else
 1965                                 path[sb.st_size] = 0;
 1966                         if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
 1967                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
 1968                                         sb.st_size);
 1969                         } else {
 1970                                 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
 1971                                 if (rc)
 1972                                         return (rc);
 1973                         }
 1974 
 1975                         /*
 1976                          * Restart with the new path, starting either at
 1977                          * the root or at the parent depending whether or
 1978                          * not the link is relative.
 1979                          */
 1980                         p = path;
 1981                         if (*p == '/')
 1982                                 objnum = rootnum;
 1983                         else
 1984                                 objnum = parentnum;
 1985                         objset_get_dnode(spa, &mount->objset, objnum, &dn);
 1986                 }
 1987         }
 1988 
 1989         *dnode = dn;
 1990         return (0);
 1991 }

Cache object: b472af29f844b55bc01721a51eec7b10


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.