The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2007 Doug Rabson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/8.1/sys/boot/zfs/zfsimpl.c 208892 2010-06-07 13:37:13Z avg $");
   29 
   30 /*
   31  *      Stand-alone ZFS file reader.
   32  */
   33 
   34 #include "zfsimpl.h"
   35 #include "zfssubr.c"
   36 
   37 /*
   38  * List of all vdevs, chained through v_alllink.
   39  */
   40 static vdev_list_t zfs_vdevs;
   41 
   42 /*
   43  * List of all pools, chained through spa_link.
   44  */
   45 static spa_list_t zfs_pools;
   46 
   47 static uint64_t zfs_crc64_table[256];
   48 static const dnode_phys_t *dnode_cache_obj = 0;
   49 static uint64_t dnode_cache_bn;
   50 static char *dnode_cache_buf;
   51 static char *zap_scratch;
   52 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
   53 
   54 #define TEMP_SIZE       (1024 * 1024)
   55 
   56 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
   57 
   58 static void
   59 zfs_init(void)
   60 {
   61         STAILQ_INIT(&zfs_vdevs);
   62         STAILQ_INIT(&zfs_pools);
   63 
   64         zfs_temp_buf = malloc(TEMP_SIZE);
   65         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
   66         zfs_temp_ptr = zfs_temp_buf;
   67         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
   68         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
   69 
   70         zfs_init_crc();
   71 }
   72 
   73 static char *
   74 zfs_alloc_temp(size_t sz)
   75 {
   76         char *p;
   77 
   78         if (zfs_temp_ptr + sz > zfs_temp_end) {
   79                 printf("ZFS: out of temporary buffer space\n");
   80                 for (;;) ;
   81         }
   82         p = zfs_temp_ptr;
   83         zfs_temp_ptr += sz;
   84 
   85         return (p);
   86 }
   87 
   88 static void
   89 zfs_reset_temp(void)
   90 {
   91 
   92         zfs_temp_ptr = zfs_temp_buf;
   93 }
   94 
   95 static int
   96 xdr_int(const unsigned char **xdr, int *ip)
   97 {
   98         *ip = ((*xdr)[0] << 24)
   99                 | ((*xdr)[1] << 16)
  100                 | ((*xdr)[2] << 8)
  101                 | ((*xdr)[3] << 0);
  102         (*xdr) += 4;
  103         return (0);
  104 }
  105 
  106 static int
  107 xdr_u_int(const unsigned char **xdr, u_int *ip)
  108 {
  109         *ip = ((*xdr)[0] << 24)
  110                 | ((*xdr)[1] << 16)
  111                 | ((*xdr)[2] << 8)
  112                 | ((*xdr)[3] << 0);
  113         (*xdr) += 4;
  114         return (0);
  115 }
  116 
  117 static int
  118 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
  119 {
  120         u_int hi, lo;
  121 
  122         xdr_u_int(xdr, &hi);
  123         xdr_u_int(xdr, &lo);
  124         *lp = (((uint64_t) hi) << 32) | lo;
  125         return (0);
  126 }
  127 
  128 static int
  129 nvlist_find(const unsigned char *nvlist, const char *name, int type,
  130             int* elementsp, void *valuep)
  131 {
  132         const unsigned char *p, *pair;
  133         int junk;
  134         int encoded_size, decoded_size;
  135 
  136         p = nvlist;
  137         xdr_int(&p, &junk);
  138         xdr_int(&p, &junk);
  139 
  140         pair = p;
  141         xdr_int(&p, &encoded_size);
  142         xdr_int(&p, &decoded_size);
  143         while (encoded_size && decoded_size) {
  144                 int namelen, pairtype, elements;
  145                 const char *pairname;
  146 
  147                 xdr_int(&p, &namelen);
  148                 pairname = (const char*) p;
  149                 p += roundup(namelen, 4);
  150                 xdr_int(&p, &pairtype);
  151 
  152                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
  153                         xdr_int(&p, &elements);
  154                         if (elementsp)
  155                                 *elementsp = elements;
  156                         if (type == DATA_TYPE_UINT64) {
  157                                 xdr_uint64_t(&p, (uint64_t *) valuep);
  158                                 return (0);
  159                         } else if (type == DATA_TYPE_STRING) {
  160                                 int len;
  161                                 xdr_int(&p, &len);
  162                                 (*(const char**) valuep) = (const char*) p;
  163                                 return (0);
  164                         } else if (type == DATA_TYPE_NVLIST
  165                                    || type == DATA_TYPE_NVLIST_ARRAY) {
  166                                 (*(const unsigned char**) valuep) =
  167                                          (const unsigned char*) p;
  168                                 return (0);
  169                         } else {
  170                                 return (EIO);
  171                         }
  172                 } else {
  173                         /*
  174                          * Not the pair we are looking for, skip to the next one.
  175                          */
  176                         p = pair + encoded_size;
  177                 }
  178 
  179                 pair = p;
  180                 xdr_int(&p, &encoded_size);
  181                 xdr_int(&p, &decoded_size);
  182         }
  183 
  184         return (EIO);
  185 }
  186 
  187 /*
  188  * Return the next nvlist in an nvlist array.
  189  */
  190 static const unsigned char *
  191 nvlist_next(const unsigned char *nvlist)
  192 {
  193         const unsigned char *p, *pair;
  194         int junk;
  195         int encoded_size, decoded_size;
  196 
  197         p = nvlist;
  198         xdr_int(&p, &junk);
  199         xdr_int(&p, &junk);
  200 
  201         pair = p;
  202         xdr_int(&p, &encoded_size);
  203         xdr_int(&p, &decoded_size);
  204         while (encoded_size && decoded_size) {
  205                 p = pair + encoded_size;
  206 
  207                 pair = p;
  208                 xdr_int(&p, &encoded_size);
  209                 xdr_int(&p, &decoded_size);
  210         }
  211 
  212         return p;
  213 }
  214 
  215 #ifdef TEST
  216 
  217 static const unsigned char *
  218 nvlist_print(const unsigned char *nvlist, unsigned int indent)
  219 {
  220         static const char* typenames[] = {
  221                 "DATA_TYPE_UNKNOWN",
  222                 "DATA_TYPE_BOOLEAN",
  223                 "DATA_TYPE_BYTE",
  224                 "DATA_TYPE_INT16",
  225                 "DATA_TYPE_UINT16",
  226                 "DATA_TYPE_INT32",
  227                 "DATA_TYPE_UINT32",
  228                 "DATA_TYPE_INT64",
  229                 "DATA_TYPE_UINT64",
  230                 "DATA_TYPE_STRING",
  231                 "DATA_TYPE_BYTE_ARRAY",
  232                 "DATA_TYPE_INT16_ARRAY",
  233                 "DATA_TYPE_UINT16_ARRAY",
  234                 "DATA_TYPE_INT32_ARRAY",
  235                 "DATA_TYPE_UINT32_ARRAY",
  236                 "DATA_TYPE_INT64_ARRAY",
  237                 "DATA_TYPE_UINT64_ARRAY",
  238                 "DATA_TYPE_STRING_ARRAY",
  239                 "DATA_TYPE_HRTIME",
  240                 "DATA_TYPE_NVLIST",
  241                 "DATA_TYPE_NVLIST_ARRAY",
  242                 "DATA_TYPE_BOOLEAN_VALUE",
  243                 "DATA_TYPE_INT8",
  244                 "DATA_TYPE_UINT8",
  245                 "DATA_TYPE_BOOLEAN_ARRAY",
  246                 "DATA_TYPE_INT8_ARRAY",
  247                 "DATA_TYPE_UINT8_ARRAY"
  248         };
  249 
  250         unsigned int i, j;
  251         const unsigned char *p, *pair;
  252         int junk;
  253         int encoded_size, decoded_size;
  254 
  255         p = nvlist;
  256         xdr_int(&p, &junk);
  257         xdr_int(&p, &junk);
  258 
  259         pair = p;
  260         xdr_int(&p, &encoded_size);
  261         xdr_int(&p, &decoded_size);
  262         while (encoded_size && decoded_size) {
  263                 int namelen, pairtype, elements;
  264                 const char *pairname;
  265 
  266                 xdr_int(&p, &namelen);
  267                 pairname = (const char*) p;
  268                 p += roundup(namelen, 4);
  269                 xdr_int(&p, &pairtype);
  270 
  271                 for (i = 0; i < indent; i++)
  272                         printf(" ");
  273                 printf("%s %s", typenames[pairtype], pairname);
  274 
  275                 xdr_int(&p, &elements);
  276                 switch (pairtype) {
  277                 case DATA_TYPE_UINT64: {
  278                         uint64_t val;
  279                         xdr_uint64_t(&p, &val);
  280                         printf(" = 0x%llx\n", val);
  281                         break;
  282                 }
  283 
  284                 case DATA_TYPE_STRING: {
  285                         int len;
  286                         xdr_int(&p, &len);
  287                         printf(" = \"%s\"\n", p);
  288                         break;
  289                 }
  290 
  291                 case DATA_TYPE_NVLIST:
  292                         printf("\n");
  293                         nvlist_print(p, indent + 1);
  294                         break;
  295 
  296                 case DATA_TYPE_NVLIST_ARRAY:
  297                         for (j = 0; j < elements; j++) {
  298                                 printf("[%d]\n", j);
  299                                 p = nvlist_print(p, indent + 1);
  300                                 if (j != elements - 1) {
  301                                         for (i = 0; i < indent; i++)
  302                                                 printf(" ");
  303                                         printf("%s %s", typenames[pairtype], pairname);
  304                                 }
  305                         }
  306                         break;
  307 
  308                 default:
  309                         printf("\n");
  310                 }
  311 
  312                 p = pair + encoded_size;
  313 
  314                 pair = p;
  315                 xdr_int(&p, &encoded_size);
  316                 xdr_int(&p, &decoded_size);
  317         }
  318 
  319         return p;
  320 }
  321 
  322 #endif
  323 
  324 static int
  325 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
  326     off_t offset, size_t size)
  327 {
  328         size_t psize;
  329         int rc;
  330 
  331         if (bp) {
  332                 psize = BP_GET_PSIZE(bp);
  333         } else {
  334                 psize = size;
  335         }
  336 
  337         /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
  338         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
  339         if (rc)
  340                 return (rc);
  341         if (bp && zio_checksum_error(bp, buf))
  342                 return (EIO);
  343 
  344         return (0);
  345 }
  346 
  347 static int
  348 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  349     off_t offset, size_t bytes)
  350 {
  351 
  352         return (vdev_read_phys(vdev, bp, buf,
  353                 offset + VDEV_LABEL_START_SIZE, bytes));
  354 }
  355 
  356 
  357 static int
  358 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  359     off_t offset, size_t bytes)
  360 {
  361         vdev_t *kid;
  362         int rc;
  363 
  364         rc = EIO;
  365         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  366                 if (kid->v_state != VDEV_STATE_HEALTHY)
  367                         continue;
  368                 rc = kid->v_read(kid, bp, buf, offset, bytes);
  369                 if (!rc)
  370                         return (0);
  371         }
  372 
  373         return (rc);
  374 }
  375 
  376 static vdev_t *
  377 vdev_find(uint64_t guid)
  378 {
  379         vdev_t *vdev;
  380 
  381         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
  382                 if (vdev->v_guid == guid)
  383                         return (vdev);
  384 
  385         return (0);
  386 }
  387 
  388 static vdev_t *
  389 vdev_create(uint64_t guid, vdev_read_t *read)
  390 {
  391         vdev_t *vdev;
  392 
  393         vdev = malloc(sizeof(vdev_t));
  394         memset(vdev, 0, sizeof(vdev_t));
  395         STAILQ_INIT(&vdev->v_children);
  396         vdev->v_guid = guid;
  397         vdev->v_state = VDEV_STATE_OFFLINE;
  398         vdev->v_read = read;
  399         vdev->v_phys_read = 0;
  400         vdev->v_read_priv = 0;
  401         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
  402 
  403         return (vdev);
  404 }
  405 
  406 static int
  407 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
  408 {
  409         int rc;
  410         uint64_t guid, id, ashift, nparity;
  411         const char *type;
  412         const char *path;
  413         vdev_t *vdev, *kid;
  414         const unsigned char *kids;
  415         int nkids, i, is_new;
  416         uint64_t is_offline, is_faulted, is_degraded, is_removed;
  417 
  418         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
  419                         DATA_TYPE_UINT64, 0, &guid)
  420             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
  421                            DATA_TYPE_UINT64, 0, &id)
  422             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
  423                            DATA_TYPE_STRING, 0, &type)) {
  424                 printf("ZFS: can't find vdev details\n");
  425                 return (ENOENT);
  426         }
  427 
  428         if (strcmp(type, VDEV_TYPE_MIRROR)
  429             && strcmp(type, VDEV_TYPE_DISK)
  430             && strcmp(type, VDEV_TYPE_RAIDZ)) {
  431                 printf("ZFS: can only boot from disk, mirror or raidz vdevs\n");
  432                 return (EIO);
  433         }
  434 
  435         is_offline = is_removed = is_faulted = is_degraded = 0;
  436 
  437         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
  438                         &is_offline);
  439         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
  440                         &is_removed);
  441         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
  442                         &is_faulted);
  443         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
  444                         &is_degraded);
  445 
  446         vdev = vdev_find(guid);
  447         if (!vdev) {
  448                 is_new = 1;
  449 
  450                 if (!strcmp(type, VDEV_TYPE_MIRROR))
  451                         vdev = vdev_create(guid, vdev_mirror_read);
  452                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
  453                         vdev = vdev_create(guid, vdev_raidz_read);
  454                 else
  455                         vdev = vdev_create(guid, vdev_disk_read);
  456 
  457                 vdev->v_id = id;
  458                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
  459                         DATA_TYPE_UINT64, 0, &ashift) == 0)
  460                         vdev->v_ashift = ashift;
  461                 else
  462                         vdev->v_ashift = 0;
  463                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
  464                         DATA_TYPE_UINT64, 0, &nparity) == 0)
  465                         vdev->v_nparity = nparity;
  466                 else
  467                         vdev->v_nparity = 0;
  468                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
  469                                 DATA_TYPE_STRING, 0, &path) == 0) {
  470                         if (strlen(path) > 5
  471                             && path[0] == '/'
  472                             && path[1] == 'd'
  473                             && path[2] == 'e'
  474                             && path[3] == 'v'
  475                             && path[4] == '/')
  476                                 path += 5;
  477                         vdev->v_name = strdup(path);
  478                 } else {
  479                         if (!strcmp(type, "raidz")) {
  480                                 if (vdev->v_nparity == 1)
  481                                         vdev->v_name = "raidz1";
  482                                 else
  483                                         vdev->v_name = "raidz2";
  484                         } else {
  485                                 vdev->v_name = strdup(type);
  486                         }
  487                 }
  488 
  489                 if (is_offline)
  490                         vdev->v_state = VDEV_STATE_OFFLINE;
  491                 else if (is_removed)
  492                         vdev->v_state = VDEV_STATE_REMOVED;
  493                 else if (is_faulted)
  494                         vdev->v_state = VDEV_STATE_FAULTED;
  495                 else if (is_degraded)
  496                         vdev->v_state = VDEV_STATE_DEGRADED;
  497                 else
  498                         vdev->v_state = VDEV_STATE_HEALTHY;
  499         } else {
  500                 is_new = 0;
  501 
  502                 if (is_newer) {
  503                         /*
  504                          * We've already seen this vdev, but from an older
  505                          * vdev label, so let's refresh its state from the
  506                          * newer label.
  507                          */
  508                         if (is_offline)
  509                                 vdev->v_state = VDEV_STATE_OFFLINE;
  510                         else if (is_removed)
  511                                 vdev->v_state = VDEV_STATE_REMOVED;
  512                         else if (is_faulted)
  513                                 vdev->v_state = VDEV_STATE_FAULTED;
  514                         else if (is_degraded)
  515                                 vdev->v_state = VDEV_STATE_DEGRADED;
  516                         else
  517                                 vdev->v_state = VDEV_STATE_HEALTHY;
  518                 }
  519         }
  520 
  521         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
  522                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
  523         /*
  524          * Its ok if we don't have any kids.
  525          */
  526         if (rc == 0) {
  527                 vdev->v_nchildren = nkids;
  528                 for (i = 0; i < nkids; i++) {
  529                         rc = vdev_init_from_nvlist(kids, &kid, is_newer);
  530                         if (rc)
  531                                 return (rc);
  532                         if (is_new)
  533                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
  534                                                    v_childlink);
  535                         kids = nvlist_next(kids);
  536                 }
  537         } else {
  538                 vdev->v_nchildren = 0;
  539         }
  540 
  541         if (vdevp)
  542                 *vdevp = vdev;
  543         return (0);
  544 }
  545 
  546 static void
  547 vdev_set_state(vdev_t *vdev)
  548 {
  549         vdev_t *kid;
  550         int good_kids;
  551         int bad_kids;
  552 
  553         /*
  554          * A mirror or raidz is healthy if all its kids are healthy. A
  555          * mirror is degraded if any of its kids is healthy; a raidz
  556          * is degraded if at most nparity kids are offline.
  557          */
  558         if (STAILQ_FIRST(&vdev->v_children)) {
  559                 good_kids = 0;
  560                 bad_kids = 0;
  561                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  562                         if (kid->v_state == VDEV_STATE_HEALTHY)
  563                                 good_kids++;
  564                         else
  565                                 bad_kids++;
  566                 }
  567                 if (bad_kids == 0) {
  568                         vdev->v_state = VDEV_STATE_HEALTHY;
  569                 } else {
  570                         if (vdev->v_read == vdev_mirror_read) {
  571                                 if (good_kids) {
  572                                         vdev->v_state = VDEV_STATE_DEGRADED;
  573                                 } else {
  574                                         vdev->v_state = VDEV_STATE_OFFLINE;
  575                                 }
  576                         } else if (vdev->v_read == vdev_raidz_read) {
  577                                 if (bad_kids > vdev->v_nparity) {
  578                                         vdev->v_state = VDEV_STATE_OFFLINE;
  579                                 } else {
  580                                         vdev->v_state = VDEV_STATE_DEGRADED;
  581                                 }
  582                         }
  583                 }
  584         }
  585 }
  586 
  587 static spa_t *
  588 spa_find_by_guid(uint64_t guid)
  589 {
  590         spa_t *spa;
  591 
  592         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  593                 if (spa->spa_guid == guid)
  594                         return (spa);
  595 
  596         return (0);
  597 }
  598 
  599 #ifdef BOOT2
  600 
  601 static spa_t *
  602 spa_find_by_name(const char *name)
  603 {
  604         spa_t *spa;
  605 
  606         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  607                 if (!strcmp(spa->spa_name, name))
  608                         return (spa);
  609 
  610         return (0);
  611 }
  612 
  613 #endif
  614 
  615 static spa_t *
  616 spa_create(uint64_t guid)
  617 {
  618         spa_t *spa;
  619 
  620         spa = malloc(sizeof(spa_t));
  621         memset(spa, 0, sizeof(spa_t));
  622         STAILQ_INIT(&spa->spa_vdevs);
  623         spa->spa_guid = guid;
  624         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
  625 
  626         return (spa);
  627 }
  628 
  629 static const char *
  630 state_name(vdev_state_t state)
  631 {
  632         static const char* names[] = {
  633                 "UNKNOWN",
  634                 "CLOSED",
  635                 "OFFLINE",
  636                 "REMOVED",
  637                 "CANT_OPEN",
  638                 "FAULTED",
  639                 "DEGRADED",
  640                 "ONLINE"
  641         };
  642         return names[state];
  643 }
  644 
  645 #ifdef BOOT2
  646 
  647 #define pager_printf printf
  648 
  649 #else
  650 
  651 static void
  652 pager_printf(const char *fmt, ...)
  653 {
  654         char line[80];
  655         va_list args;
  656 
  657         va_start(args, fmt);
  658         vsprintf(line, fmt, args);
  659         va_end(args);
  660         pager_output(line);
  661 }
  662 
  663 #endif
  664 
  665 #define STATUS_FORMAT   "        %-16s %-10s\n"
  666 
  667 static void
  668 print_state(int indent, const char *name, vdev_state_t state)
  669 {
  670         int i;
  671         char buf[512];
  672 
  673         buf[0] = 0;
  674         for (i = 0; i < indent; i++)
  675                 strcat(buf, "  ");
  676         strcat(buf, name);
  677         pager_printf(STATUS_FORMAT, buf, state_name(state));
  678         
  679 }
  680 
  681 static void
  682 vdev_status(vdev_t *vdev, int indent)
  683 {
  684         vdev_t *kid;
  685         print_state(indent, vdev->v_name, vdev->v_state);
  686 
  687         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  688                 vdev_status(kid, indent + 1);
  689         }
  690 }
  691 
  692 static void
  693 spa_status(spa_t *spa)
  694 {
  695         vdev_t *vdev;
  696         int good_kids, bad_kids, degraded_kids;
  697         vdev_state_t state;
  698 
  699         pager_printf("  pool: %s\n", spa->spa_name);
  700         pager_printf("config:\n\n");
  701         pager_printf(STATUS_FORMAT, "NAME", "STATE");
  702 
  703         good_kids = 0;
  704         degraded_kids = 0;
  705         bad_kids = 0;
  706         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  707                 if (vdev->v_state == VDEV_STATE_HEALTHY)
  708                         good_kids++;
  709                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
  710                         degraded_kids++;
  711                 else
  712                         bad_kids++;
  713         }
  714 
  715         state = VDEV_STATE_CLOSED;
  716         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
  717                 state = VDEV_STATE_HEALTHY;
  718         else if ((good_kids + degraded_kids) > 0)
  719                 state = VDEV_STATE_DEGRADED;
  720 
  721         print_state(0, spa->spa_name, state);
  722         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  723                 vdev_status(vdev, 1);
  724         }
  725 }
  726 
  727 static void
  728 spa_all_status(void)
  729 {
  730         spa_t *spa;
  731         int first = 1;
  732 
  733         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
  734                 if (!first)
  735                         pager_printf("\n");
  736                 first = 0;
  737                 spa_status(spa);
  738         }
  739 }
  740 
  741 static int
  742 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
  743 {
  744         vdev_t vtmp;
  745         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
  746         spa_t *spa;
  747         vdev_t *vdev, *top_vdev, *pool_vdev;
  748         off_t off;
  749         blkptr_t bp;
  750         const unsigned char *nvlist;
  751         uint64_t val;
  752         uint64_t guid;
  753         uint64_t pool_txg, pool_guid;
  754         const char *pool_name;
  755         const unsigned char *vdevs;
  756         int i, rc, is_newer;
  757         char upbuf[1024];
  758         const struct uberblock *up;
  759 
  760         /*
  761          * Load the vdev label and figure out which
  762          * uberblock is most current.
  763          */
  764         memset(&vtmp, 0, sizeof(vtmp));
  765         vtmp.v_phys_read = read;
  766         vtmp.v_read_priv = read_priv;
  767         off = offsetof(vdev_label_t, vl_vdev_phys);
  768         BP_ZERO(&bp);
  769         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
  770         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
  771         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  772         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  773         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  774         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
  775                 return (EIO);
  776 
  777         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
  778                 return (EIO);
  779         }
  780 
  781         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
  782 
  783         if (nvlist_find(nvlist,
  784                         ZPOOL_CONFIG_VERSION,
  785                         DATA_TYPE_UINT64, 0, &val)) {
  786                 return (EIO);
  787         }
  788 
  789         if (val > SPA_VERSION) {
  790                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
  791                     (unsigned) val, (unsigned) SPA_VERSION);
  792                 return (EIO);
  793         }
  794 
  795         if (nvlist_find(nvlist,
  796                         ZPOOL_CONFIG_POOL_STATE,
  797                         DATA_TYPE_UINT64, 0, &val)) {
  798                 return (EIO);
  799         }
  800 
  801 #ifndef TEST
  802         if (val != POOL_STATE_ACTIVE) {
  803                 /*
  804                  * Don't print a message here. If we happen to reboot
  805                  * while there is an exported pool around, we don't
  806                  * need a cascade of confusing messages during boot.
  807                  */
  808                 /*printf("ZFS: pool is not active\n");*/
  809                 return (EIO);
  810         }
  811 #endif
  812 
  813         if (nvlist_find(nvlist,
  814                         ZPOOL_CONFIG_POOL_TXG,
  815                         DATA_TYPE_UINT64, 0, &pool_txg)
  816             || nvlist_find(nvlist,
  817                            ZPOOL_CONFIG_POOL_GUID,
  818                            DATA_TYPE_UINT64, 0, &pool_guid)
  819             || nvlist_find(nvlist,
  820                            ZPOOL_CONFIG_POOL_NAME,
  821                            DATA_TYPE_STRING, 0, &pool_name)) {
  822                 /*
  823                  * Cache and spare devices end up here - just ignore
  824                  * them.
  825                  */
  826                 /*printf("ZFS: can't find pool details\n");*/
  827                 return (EIO);
  828         }
  829 
  830         /*
  831          * Create the pool if this is the first time we've seen it.
  832          */
  833         spa = spa_find_by_guid(pool_guid);
  834         if (!spa) {
  835                 spa = spa_create(pool_guid);
  836                 spa->spa_name = strdup(pool_name);
  837         }
  838         if (pool_txg > spa->spa_txg) {
  839                 spa->spa_txg = pool_txg;
  840                 is_newer = 1;
  841         } else
  842                 is_newer = 0;
  843 
  844         /*
  845          * Get the vdev tree and create our in-core copy of it.
  846          * If we already have a vdev with this guid, this must
  847          * be some kind of alias (overlapping slices, dangerously dedicated
  848          * disks etc).
  849          */
  850         if (nvlist_find(nvlist,
  851                         ZPOOL_CONFIG_GUID,
  852                         DATA_TYPE_UINT64, 0, &guid)) {
  853                 return (EIO);
  854         }
  855         vdev = vdev_find(guid);
  856         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
  857                 return (EIO);
  858 
  859         if (nvlist_find(nvlist,
  860                         ZPOOL_CONFIG_VDEV_TREE,
  861                         DATA_TYPE_NVLIST, 0, &vdevs)) {
  862                 return (EIO);
  863         }
  864 
  865         rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
  866         if (rc)
  867                 return (rc);
  868 
  869         /*
  870          * Add the toplevel vdev to the pool if its not already there.
  871          */
  872         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
  873                 if (top_vdev == pool_vdev)
  874                         break;
  875         if (!pool_vdev && top_vdev)
  876                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
  877 
  878         /*
  879          * We should already have created an incomplete vdev for this
  880          * vdev. Find it and initialise it with our read proc.
  881          */
  882         vdev = vdev_find(guid);
  883         if (vdev) {
  884                 vdev->v_phys_read = read;
  885                 vdev->v_read_priv = read_priv;
  886         } else {
  887                 printf("ZFS: inconsistent nvlist contents\n");
  888                 return (EIO);
  889         }
  890 
  891         /*
  892          * Re-evaluate top-level vdev state.
  893          */
  894         vdev_set_state(top_vdev);
  895 
  896         /*
  897          * Ok, we are happy with the pool so far. Lets find
  898          * the best uberblock and then we can actually access
  899          * the contents of the pool.
  900          */
  901         for (i = 0;
  902              i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
  903              i++) {
  904                 off = offsetof(vdev_label_t, vl_uberblock);
  905                 off += i << UBERBLOCK_SHIFT;
  906                 BP_ZERO(&bp);
  907                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
  908                 BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
  909                 BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
  910                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  911                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  912                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  913                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
  914                         continue;
  915 
  916                 up = (const struct uberblock *) upbuf;
  917                 if (up->ub_magic != UBERBLOCK_MAGIC)
  918                         continue;
  919                 if (up->ub_txg < spa->spa_txg)
  920                         continue;
  921                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
  922                         spa->spa_uberblock = *up;
  923                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
  924                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
  925                                 spa->spa_uberblock = *up;
  926                 }
  927         }
  928 
  929         if (spap)
  930                 *spap = spa;
  931         return (0);
  932 }
  933 
  934 static int
  935 ilog2(int n)
  936 {
  937         int v;
  938 
  939         for (v = 0; v < 32; v++)
  940                 if (n == (1 << v))
  941                         return v;
  942         return -1;
  943 }
  944 
  945 static int
  946 zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
  947 {
  948         zio_gbh_phys_t zio_gb;
  949         vdev_t *vdev;
  950         int vdevid;
  951         off_t offset;
  952         int i;
  953 
  954         vdevid = DVA_GET_VDEV(dva);
  955         offset = DVA_GET_OFFSET(dva);
  956         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
  957                 if (vdev->v_id == vdevid)
  958                         break;
  959         if (!vdev || !vdev->v_read)
  960                 return (EIO);
  961         if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
  962                 return (EIO);
  963 
  964         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
  965                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
  966 
  967                 if (BP_IS_HOLE(gbp))
  968                         continue;
  969                 if (zio_read(spa, gbp, buf))
  970                         return (EIO);
  971                 buf = (char*)buf + BP_GET_PSIZE(gbp);
  972         }
  973  
  974         return (0);
  975 }
  976 
  977 static int
  978 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
  979 {
  980         int cpfunc = BP_GET_COMPRESS(bp);
  981         size_t lsize = BP_GET_LSIZE(bp);
  982         size_t psize = BP_GET_PSIZE(bp);
  983         void *pbuf;
  984         int i;
  985 
  986         zfs_reset_temp();
  987         if (cpfunc != ZIO_COMPRESS_OFF)
  988                 pbuf = zfs_alloc_temp(psize);
  989         else
  990                 pbuf = buf;
  991 
  992         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
  993                 const dva_t *dva = &bp->blk_dva[i];
  994                 vdev_t *vdev;
  995                 int vdevid;
  996                 off_t offset;
  997 
  998                 if (!dva->dva_word[0] && !dva->dva_word[1])
  999                         continue;
 1000 
 1001                 if (DVA_GET_GANG(dva)) {
 1002                         if (zio_read_gang(spa, bp, dva, buf))
 1003                                 continue;
 1004                 } else {
 1005                         vdevid = DVA_GET_VDEV(dva);
 1006                         offset = DVA_GET_OFFSET(dva);
 1007                         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
 1008                                 if (vdev->v_id == vdevid)
 1009                                         break;
 1010                         if (!vdev || !vdev->v_read) {
 1011                                 continue;
 1012                         }
 1013                         if (vdev->v_read(vdev, bp, pbuf, offset, psize))
 1014                                 continue;
 1015 
 1016                         if (cpfunc != ZIO_COMPRESS_OFF) {
 1017                                 if (zio_decompress_data(cpfunc, pbuf, psize,
 1018                                     buf, lsize))
 1019                                         return (EIO);
 1020                         }
 1021                 }
 1022 
 1023                 return (0);
 1024         }
 1025         printf("ZFS: i/o error - all block copies unavailable\n");
 1026 
 1027         return (EIO);
 1028 }
 1029 
 1030 static int
 1031 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
 1032 {
 1033         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
 1034         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1035         int nlevels = dnode->dn_nlevels;
 1036         int i, rc;
 1037 
 1038         /*
 1039          * Note: bsize may not be a power of two here so we need to do an
 1040          * actual divide rather than a bitshift.
 1041          */
 1042         while (buflen > 0) {
 1043                 uint64_t bn = offset / bsize;
 1044                 int boff = offset % bsize;
 1045                 int ibn;
 1046                 const blkptr_t *indbp;
 1047                 blkptr_t bp;
 1048 
 1049                 if (bn > dnode->dn_maxblkid)
 1050                         return (EIO);
 1051 
 1052                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
 1053                         goto cached;
 1054 
 1055                 indbp = dnode->dn_blkptr;
 1056                 for (i = 0; i < nlevels; i++) {
 1057                         /*
 1058                          * Copy the bp from the indirect array so that
 1059                          * we can re-use the scratch buffer for multi-level
 1060                          * objects.
 1061                          */
 1062                         ibn = bn >> ((nlevels - i - 1) * ibshift);
 1063                         ibn &= ((1 << ibshift) - 1);
 1064                         bp = indbp[ibn];
 1065                         rc = zio_read(spa, &bp, dnode_cache_buf);
 1066                         if (rc)
 1067                                 return (rc);
 1068                         indbp = (const blkptr_t *) dnode_cache_buf;
 1069                 }
 1070                 dnode_cache_obj = dnode;
 1071                 dnode_cache_bn = bn;
 1072         cached:
 1073 
 1074                 /*
 1075                  * The buffer contains our data block. Copy what we
 1076                  * need from it and loop.
 1077                  */ 
 1078                 i = bsize - boff;
 1079                 if (i > buflen) i = buflen;
 1080                 memcpy(buf, &dnode_cache_buf[boff], i);
 1081                 buf = ((char*) buf) + i;
 1082                 offset += i;
 1083                 buflen -= i;
 1084         }
 1085 
 1086         return (0);
 1087 }
 1088 
 1089 /*
 1090  * Lookup a value in a microzap directory. Assumes that the zap
 1091  * scratch buffer contains the directory contents.
 1092  */
 1093 static int
 1094 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1095 {
 1096         const mzap_phys_t *mz;
 1097         const mzap_ent_phys_t *mze;
 1098         size_t size;
 1099         int chunks, i;
 1100 
 1101         /*
 1102          * Microzap objects use exactly one block. Read the whole
 1103          * thing.
 1104          */
 1105         size = dnode->dn_datablkszsec * 512;
 1106 
 1107         mz = (const mzap_phys_t *) zap_scratch;
 1108         chunks = size / MZAP_ENT_LEN - 1;
 1109 
 1110         for (i = 0; i < chunks; i++) {
 1111                 mze = &mz->mz_chunk[i];
 1112                 if (!strcmp(mze->mze_name, name)) {
 1113                         *value = mze->mze_value;
 1114                         return (0);
 1115                 }
 1116         }
 1117 
 1118         return (ENOENT);
 1119 }
 1120 
 1121 /*
 1122  * Compare a name with a zap leaf entry. Return non-zero if the name
 1123  * matches.
 1124  */
 1125 static int
 1126 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
 1127 {
 1128         size_t namelen;
 1129         const zap_leaf_chunk_t *nc;
 1130         const char *p;
 1131 
 1132         namelen = zc->l_entry.le_name_length;
 1133                         
 1134         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
 1135         p = name;
 1136         while (namelen > 0) {
 1137                 size_t len;
 1138                 len = namelen;
 1139                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1140                         len = ZAP_LEAF_ARRAY_BYTES;
 1141                 if (memcmp(p, nc->l_array.la_array, len))
 1142                         return (0);
 1143                 p += len;
 1144                 namelen -= len;
 1145                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
 1146         }
 1147 
 1148         return 1;
 1149 }
 1150 
 1151 /*
 1152  * Extract a uint64_t value from a zap leaf entry.
 1153  */
 1154 static uint64_t
 1155 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
 1156 {
 1157         const zap_leaf_chunk_t *vc;
 1158         int i;
 1159         uint64_t value;
 1160         const uint8_t *p;
 1161 
 1162         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
 1163         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
 1164                 value = (value << 8) | p[i];
 1165         }
 1166 
 1167         return value;
 1168 }
 1169 
 1170 /*
 1171  * Lookup a value in a fatzap directory. Assumes that the zap scratch
 1172  * buffer contains the directory header.
 1173  */
 1174 static int
 1175 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1176 {
 1177         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1178         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1179         fat_zap_t z;
 1180         uint64_t *ptrtbl;
 1181         uint64_t hash;
 1182         int rc;
 1183 
 1184         if (zh.zap_magic != ZAP_MAGIC)
 1185                 return (EIO);
 1186 
 1187         z.zap_block_shift = ilog2(bsize);
 1188         z.zap_phys = (zap_phys_t *) zap_scratch;
 1189 
 1190         /*
 1191          * Figure out where the pointer table is and read it in if necessary.
 1192          */
 1193         if (zh.zap_ptrtbl.zt_blk) {
 1194                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
 1195                                zap_scratch, bsize);
 1196                 if (rc)
 1197                         return (rc);
 1198                 ptrtbl = (uint64_t *) zap_scratch;
 1199         } else {
 1200                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
 1201         }
 1202 
 1203         hash = zap_hash(zh.zap_salt, name);
 1204 
 1205         zap_leaf_t zl;
 1206         zl.l_bs = z.zap_block_shift;
 1207 
 1208         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
 1209         zap_leaf_chunk_t *zc;
 1210 
 1211         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
 1212         if (rc)
 1213                 return (rc);
 1214 
 1215         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1216 
 1217         /*
 1218          * Make sure this chunk matches our hash.
 1219          */
 1220         if (zl.l_phys->l_hdr.lh_prefix_len > 0
 1221             && zl.l_phys->l_hdr.lh_prefix
 1222             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
 1223                 return (ENOENT);
 1224 
 1225         /*
 1226          * Hash within the chunk to find our entry.
 1227          */
 1228         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
 1229         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
 1230         h = zl.l_phys->l_hash[h];
 1231         if (h == 0xffff)
 1232                 return (ENOENT);
 1233         zc = &ZAP_LEAF_CHUNK(&zl, h);
 1234         while (zc->l_entry.le_hash != hash) {
 1235                 if (zc->l_entry.le_next == 0xffff) {
 1236                         zc = 0;
 1237                         break;
 1238                 }
 1239                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
 1240         }
 1241         if (fzap_name_equal(&zl, zc, name)) {
 1242                 *value = fzap_leaf_value(&zl, zc);
 1243                 return (0);
 1244         }
 1245 
 1246         return (ENOENT);
 1247 }
 1248 
 1249 /*
 1250  * Lookup a name in a zap object and return its value as a uint64_t.
 1251  */
 1252 static int
 1253 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1254 {
 1255         int rc;
 1256         uint64_t zap_type;
 1257         size_t size = dnode->dn_datablkszsec * 512;
 1258 
 1259         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
 1260         if (rc)
 1261                 return (rc);
 1262 
 1263         zap_type = *(uint64_t *) zap_scratch;
 1264         if (zap_type == ZBT_MICRO)
 1265                 return mzap_lookup(spa, dnode, name, value);
 1266         else
 1267                 return fzap_lookup(spa, dnode, name, value);
 1268 }
 1269 
 1270 #ifdef BOOT2
 1271 
 1272 /*
 1273  * List a microzap directory. Assumes that the zap scratch buffer contains
 1274  * the directory contents.
 1275  */
 1276 static int
 1277 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1278 {
 1279         const mzap_phys_t *mz;
 1280         const mzap_ent_phys_t *mze;
 1281         size_t size;
 1282         int chunks, i;
 1283 
 1284         /*
 1285          * Microzap objects use exactly one block. Read the whole
 1286          * thing.
 1287          */
 1288         size = dnode->dn_datablkszsec * 512;
 1289         mz = (const mzap_phys_t *) zap_scratch;
 1290         chunks = size / MZAP_ENT_LEN - 1;
 1291 
 1292         for (i = 0; i < chunks; i++) {
 1293                 mze = &mz->mz_chunk[i];
 1294                 if (mze->mze_name[0])
 1295                         //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
 1296                         printf("%s\n", mze->mze_name);
 1297         }
 1298 
 1299         return (0);
 1300 }
 1301 
 1302 /*
 1303  * List a fatzap directory. Assumes that the zap scratch buffer contains
 1304  * the directory header.
 1305  */
 1306 static int
 1307 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1308 {
 1309         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1310         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1311         fat_zap_t z;
 1312         int i, j;
 1313 
 1314         if (zh.zap_magic != ZAP_MAGIC)
 1315                 return (EIO);
 1316 
 1317         z.zap_block_shift = ilog2(bsize);
 1318         z.zap_phys = (zap_phys_t *) zap_scratch;
 1319 
 1320         /*
 1321          * This assumes that the leaf blocks start at block 1. The
 1322          * documentation isn't exactly clear on this.
 1323          */
 1324         zap_leaf_t zl;
 1325         zl.l_bs = z.zap_block_shift;
 1326         for (i = 0; i < zh.zap_num_leafs; i++) {
 1327                 off_t off = (i + 1) << zl.l_bs;
 1328                 char name[256], *p;
 1329                 uint64_t value;
 1330 
 1331                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
 1332                         return (EIO);
 1333 
 1334                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1335 
 1336                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
 1337                         zap_leaf_chunk_t *zc, *nc;
 1338                         int namelen;
 1339 
 1340                         zc = &ZAP_LEAF_CHUNK(&zl, j);
 1341                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
 1342                                 continue;
 1343                         namelen = zc->l_entry.le_name_length;
 1344                         if (namelen > sizeof(name))
 1345                                 namelen = sizeof(name);
 1346                         
 1347                         /*
 1348                          * Paste the name back together.
 1349                          */
 1350                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
 1351                         p = name;
 1352                         while (namelen > 0) {
 1353                                 int len;
 1354                                 len = namelen;
 1355                                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1356                                         len = ZAP_LEAF_ARRAY_BYTES;
 1357                                 memcpy(p, nc->l_array.la_array, len);
 1358                                 p += len;
 1359                                 namelen -= len;
 1360                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
 1361                         }
 1362 
 1363                         /*
 1364                          * Assume the first eight bytes of the value are
 1365                          * a uint64_t.
 1366                          */
 1367                         value = fzap_leaf_value(&zl, zc);
 1368 
 1369                         printf("%-32s 0x%llx\n", name, value);
 1370                 }
 1371         }
 1372 
 1373         return (0);
 1374 }
 1375 
 1376 /*
 1377  * List a zap directory.
 1378  */
 1379 static int
 1380 zap_list(spa_t *spa, const dnode_phys_t *dnode)
 1381 {
 1382         uint64_t zap_type;
 1383         size_t size = dnode->dn_datablkszsec * 512;
 1384 
 1385         if (dnode_read(spa, dnode, 0, zap_scratch, size))
 1386                 return (EIO);
 1387 
 1388         zap_type = *(uint64_t *) zap_scratch;
 1389         if (zap_type == ZBT_MICRO)
 1390                 return mzap_list(spa, dnode);
 1391         else
 1392                 return fzap_list(spa, dnode);
 1393 }
 1394 
 1395 #endif
 1396 
 1397 static int
 1398 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
 1399 {
 1400         off_t offset;
 1401 
 1402         offset = objnum * sizeof(dnode_phys_t);
 1403         return dnode_read(spa, &os->os_meta_dnode, offset,
 1404                 dnode, sizeof(dnode_phys_t));
 1405 }
 1406 
 1407 /*
 1408  * Find the object set given the object number of its dataset object
 1409  * and return its details in *objset
 1410  */
 1411 static int
 1412 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
 1413 {
 1414         dnode_phys_t dataset;
 1415         dsl_dataset_phys_t *ds;
 1416 
 1417         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
 1418                 printf("ZFS: can't find dataset %llu\n", objnum);
 1419                 return (EIO);
 1420         }
 1421 
 1422         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
 1423         if (zio_read(spa, &ds->ds_bp, objset)) {
 1424                 printf("ZFS: can't read object set for dataset %llu\n", objnum);
 1425                 return (EIO);
 1426         }
 1427 
 1428         return (0);
 1429 }
 1430 
 1431 /*
 1432  * Find the object set pointed to by the BOOTFS property or the root
 1433  * dataset if there is none and return its details in *objset
 1434  */
 1435 static int
 1436 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
 1437 {
 1438         dnode_phys_t dir, propdir;
 1439         uint64_t props, bootfs, root;
 1440 
 1441         /*
 1442          * Start with the MOS directory object.
 1443          */
 1444         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
 1445                 printf("ZFS: can't read MOS object directory\n");
 1446                 return (EIO);
 1447         }
 1448 
 1449         /*
 1450          * Lookup the pool_props and see if we can find a bootfs.
 1451          */
 1452         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
 1453              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
 1454              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
 1455              && bootfs != 0)
 1456                 return zfs_mount_dataset(spa, bootfs, objset);
 1457 
 1458         /*
 1459          * Lookup the root dataset directory
 1460          */
 1461         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
 1462             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
 1463                 printf("ZFS: can't find root dsl_dir\n");
 1464                 return (EIO);
 1465         }
 1466 
 1467         /*
 1468          * Use the information from the dataset directory's bonus buffer
 1469          * to find the dataset object and from that the object set itself.
 1470          */
 1471         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
 1472         return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
 1473 }
 1474 
 1475 static int
 1476 zfs_mount_pool(spa_t *spa)
 1477 {
 1478         /*
 1479          * Find the MOS and work our way in from there.
 1480          */
 1481         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
 1482                 printf("ZFS: can't read MOS\n");
 1483                 return (EIO);
 1484         }
 1485 
 1486         /*
 1487          * Find the root object set
 1488          */
 1489         if (zfs_mount_root(spa, &spa->spa_root_objset)) {
 1490                 printf("Can't find root filesystem - giving up\n");
 1491                 return (EIO);
 1492         }
 1493 
 1494         return (0);
 1495 }
 1496 
 1497 /*
 1498  * Lookup a file and return its dnode.
 1499  */
 1500 static int
 1501 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
 1502 {
 1503         int rc;
 1504         uint64_t objnum, rootnum, parentnum;
 1505         dnode_phys_t dn;
 1506         const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
 1507         const char *p, *q;
 1508         char element[256];
 1509         char path[1024];
 1510         int symlinks_followed = 0;
 1511 
 1512         if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
 1513                 printf("ZFS: unexpected object set type %llu\n",
 1514                        spa->spa_root_objset.os_type);
 1515                 return (EIO);
 1516         }
 1517 
 1518         /*
 1519          * Get the root directory dnode.
 1520          */
 1521         rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
 1522         if (rc)
 1523                 return (rc);
 1524 
 1525         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
 1526         if (rc)
 1527                 return (rc);
 1528 
 1529         rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
 1530         if (rc)
 1531                 return (rc);
 1532 
 1533         objnum = rootnum;
 1534         p = upath;
 1535         while (p && *p) {
 1536                 while (*p == '/')
 1537                         p++;
 1538                 if (!*p)
 1539                         break;
 1540                 q = strchr(p, '/');
 1541                 if (q) {
 1542                         memcpy(element, p, q - p);
 1543                         element[q - p] = 0;
 1544                         p = q;
 1545                 } else {
 1546                         strcpy(element, p);
 1547                         p = 0;
 1548                 }
 1549 
 1550                 if ((zp->zp_mode >> 12) != 0x4) {
 1551                         return (ENOTDIR);
 1552                 }
 1553 
 1554                 parentnum = objnum;
 1555                 rc = zap_lookup(spa, &dn, element, &objnum);
 1556                 if (rc)
 1557                         return (rc);
 1558                 objnum = ZFS_DIRENT_OBJ(objnum);
 1559 
 1560                 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1561                 if (rc)
 1562                         return (rc);
 1563 
 1564                 /*
 1565                  * Check for symlink.
 1566                  */
 1567                 if ((zp->zp_mode >> 12) == 0xa) {
 1568                         if (symlinks_followed > 10)
 1569                                 return (EMLINK);
 1570                         symlinks_followed++;
 1571 
 1572                         /*
 1573                          * Read the link value and copy the tail of our
 1574                          * current path onto the end.
 1575                          */
 1576                         if (p)
 1577                                 strcpy(&path[zp->zp_size], p);
 1578                         else
 1579                                 path[zp->zp_size] = 0;
 1580                         if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
 1581                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
 1582                                         zp->zp_size);
 1583                         } else {
 1584                                 rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
 1585                                 if (rc)
 1586                                         return (rc);
 1587                         }
 1588 
 1589                         /*
 1590                          * Restart with the new path, starting either at
 1591                          * the root or at the parent depending whether or
 1592                          * not the link is relative.
 1593                          */
 1594                         p = path;
 1595                         if (*p == '/')
 1596                                 objnum = rootnum;
 1597                         else
 1598                                 objnum = parentnum;
 1599                         objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1600                 }
 1601         }
 1602 
 1603         *dnode = dn;
 1604         return (0);
 1605 }

Cache object: cc4874433c3615e4e13872eb506e45c3


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.