The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2007 Doug Rabson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/7.3/sys/boot/zfs/zfsimpl.c 200690 2009-12-18 21:02:32Z jhb $");
   29 
   30 /*
   31  *      Stand-alone ZFS file reader.
   32  */
   33 
   34 #include "zfsimpl.h"
   35 #include "zfssubr.c"
   36 
   37 /*
   38  * List of all vdevs, chained through v_alllink.
   39  */
   40 static vdev_list_t zfs_vdevs;
   41 
   42 /*
   43  * List of all pools, chained through spa_link.
   44  */
   45 static spa_list_t zfs_pools;
   46 
   47 static uint64_t zfs_crc64_table[256];
   48 static const dnode_phys_t *dnode_cache_obj = 0;
   49 static uint64_t dnode_cache_bn;
   50 static char *dnode_cache_buf;
   51 static char *zap_scratch;
   52 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
   53 
   54 #define TEMP_SIZE       (1024 * 1024)
   55 
   56 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
   57 
   58 static void
   59 zfs_init(void)
   60 {
   61         STAILQ_INIT(&zfs_vdevs);
   62         STAILQ_INIT(&zfs_pools);
   63 
   64         zfs_temp_buf = malloc(TEMP_SIZE);
   65         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
   66         zfs_temp_ptr = zfs_temp_buf;
   67         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
   68         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
   69 
   70         zfs_init_crc();
   71 }
   72 
   73 static char *
   74 zfs_alloc_temp(size_t sz)
   75 {
   76         char *p;
   77 
   78         if (zfs_temp_ptr + sz > zfs_temp_end) {
   79                 printf("ZFS: out of temporary buffer space\n");
   80                 for (;;) ;
   81         }
   82         p = zfs_temp_ptr;
   83         zfs_temp_ptr += sz;
   84 
   85         return (p);
   86 }
   87 
   88 static void
   89 zfs_reset_temp(void)
   90 {
   91 
   92         zfs_temp_ptr = zfs_temp_buf;
   93 }
   94 
   95 static int
   96 xdr_int(const unsigned char **xdr, int *ip)
   97 {
   98         *ip = ((*xdr)[0] << 24)
   99                 | ((*xdr)[1] << 16)
  100                 | ((*xdr)[2] << 8)
  101                 | ((*xdr)[3] << 0);
  102         (*xdr) += 4;
  103         return (0);
  104 }
  105 
  106 static int
  107 xdr_u_int(const unsigned char **xdr, u_int *ip)
  108 {
  109         *ip = ((*xdr)[0] << 24)
  110                 | ((*xdr)[1] << 16)
  111                 | ((*xdr)[2] << 8)
  112                 | ((*xdr)[3] << 0);
  113         (*xdr) += 4;
  114         return (0);
  115 }
  116 
  117 static int
  118 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
  119 {
  120         u_int hi, lo;
  121 
  122         xdr_u_int(xdr, &hi);
  123         xdr_u_int(xdr, &lo);
  124         *lp = (((uint64_t) hi) << 32) | lo;
  125         return (0);
  126 }
  127 
  128 static int
  129 nvlist_find(const unsigned char *nvlist, const char *name, int type,
  130             int* elementsp, void *valuep)
  131 {
  132         const unsigned char *p, *pair;
  133         int junk;
  134         int encoded_size, decoded_size;
  135 
  136         p = nvlist;
  137         xdr_int(&p, &junk);
  138         xdr_int(&p, &junk);
  139 
  140         pair = p;
  141         xdr_int(&p, &encoded_size);
  142         xdr_int(&p, &decoded_size);
  143         while (encoded_size && decoded_size) {
  144                 int namelen, pairtype, elements;
  145                 const char *pairname;
  146 
  147                 xdr_int(&p, &namelen);
  148                 pairname = (const char*) p;
  149                 p += roundup(namelen, 4);
  150                 xdr_int(&p, &pairtype);
  151 
  152                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
  153                         xdr_int(&p, &elements);
  154                         if (elementsp)
  155                                 *elementsp = elements;
  156                         if (type == DATA_TYPE_UINT64) {
  157                                 xdr_uint64_t(&p, (uint64_t *) valuep);
  158                                 return (0);
  159                         } else if (type == DATA_TYPE_STRING) {
  160                                 int len;
  161                                 xdr_int(&p, &len);
  162                                 (*(const char**) valuep) = (const char*) p;
  163                                 return (0);
  164                         } else if (type == DATA_TYPE_NVLIST
  165                                    || type == DATA_TYPE_NVLIST_ARRAY) {
  166                                 (*(const unsigned char**) valuep) =
  167                                          (const unsigned char*) p;
  168                                 return (0);
  169                         } else {
  170                                 return (EIO);
  171                         }
  172                 } else {
  173                         /*
  174                          * Not the pair we are looking for, skip to the next one.
  175                          */
  176                         p = pair + encoded_size;
  177                 }
  178 
  179                 pair = p;
  180                 xdr_int(&p, &encoded_size);
  181                 xdr_int(&p, &decoded_size);
  182         }
  183 
  184         return (EIO);
  185 }
  186 
  187 /*
  188  * Return the next nvlist in an nvlist array.
  189  */
  190 static const unsigned char *
  191 nvlist_next(const unsigned char *nvlist)
  192 {
  193         const unsigned char *p, *pair;
  194         int junk;
  195         int encoded_size, decoded_size;
  196 
  197         p = nvlist;
  198         xdr_int(&p, &junk);
  199         xdr_int(&p, &junk);
  200 
  201         pair = p;
  202         xdr_int(&p, &encoded_size);
  203         xdr_int(&p, &decoded_size);
  204         while (encoded_size && decoded_size) {
  205                 p = pair + encoded_size;
  206 
  207                 pair = p;
  208                 xdr_int(&p, &encoded_size);
  209                 xdr_int(&p, &decoded_size);
  210         }
  211 
  212         return p;
  213 }
  214 
  215 #ifdef TEST
  216 
  217 static const unsigned char *
  218 nvlist_print(const unsigned char *nvlist, unsigned int indent)
  219 {
  220         static const char* typenames[] = {
  221                 "DATA_TYPE_UNKNOWN",
  222                 "DATA_TYPE_BOOLEAN",
  223                 "DATA_TYPE_BYTE",
  224                 "DATA_TYPE_INT16",
  225                 "DATA_TYPE_UINT16",
  226                 "DATA_TYPE_INT32",
  227                 "DATA_TYPE_UINT32",
  228                 "DATA_TYPE_INT64",
  229                 "DATA_TYPE_UINT64",
  230                 "DATA_TYPE_STRING",
  231                 "DATA_TYPE_BYTE_ARRAY",
  232                 "DATA_TYPE_INT16_ARRAY",
  233                 "DATA_TYPE_UINT16_ARRAY",
  234                 "DATA_TYPE_INT32_ARRAY",
  235                 "DATA_TYPE_UINT32_ARRAY",
  236                 "DATA_TYPE_INT64_ARRAY",
  237                 "DATA_TYPE_UINT64_ARRAY",
  238                 "DATA_TYPE_STRING_ARRAY",
  239                 "DATA_TYPE_HRTIME",
  240                 "DATA_TYPE_NVLIST",
  241                 "DATA_TYPE_NVLIST_ARRAY",
  242                 "DATA_TYPE_BOOLEAN_VALUE",
  243                 "DATA_TYPE_INT8",
  244                 "DATA_TYPE_UINT8",
  245                 "DATA_TYPE_BOOLEAN_ARRAY",
  246                 "DATA_TYPE_INT8_ARRAY",
  247                 "DATA_TYPE_UINT8_ARRAY"
  248         };
  249 
  250         unsigned int i, j;
  251         const unsigned char *p, *pair;
  252         int junk;
  253         int encoded_size, decoded_size;
  254 
  255         p = nvlist;
  256         xdr_int(&p, &junk);
  257         xdr_int(&p, &junk);
  258 
  259         pair = p;
  260         xdr_int(&p, &encoded_size);
  261         xdr_int(&p, &decoded_size);
  262         while (encoded_size && decoded_size) {
  263                 int namelen, pairtype, elements;
  264                 const char *pairname;
  265 
  266                 xdr_int(&p, &namelen);
  267                 pairname = (const char*) p;
  268                 p += roundup(namelen, 4);
  269                 xdr_int(&p, &pairtype);
  270 
  271                 for (i = 0; i < indent; i++)
  272                         printf(" ");
  273                 printf("%s %s", typenames[pairtype], pairname);
  274 
  275                 xdr_int(&p, &elements);
  276                 switch (pairtype) {
  277                 case DATA_TYPE_UINT64: {
  278                         uint64_t val;
  279                         xdr_uint64_t(&p, &val);
  280                         printf(" = 0x%llx\n", val);
  281                         break;
  282                 }
  283 
  284                 case DATA_TYPE_STRING: {
  285                         int len;
  286                         xdr_int(&p, &len);
  287                         printf(" = \"%s\"\n", p);
  288                         break;
  289                 }
  290 
  291                 case DATA_TYPE_NVLIST:
  292                         printf("\n");
  293                         nvlist_print(p, indent + 1);
  294                         break;
  295 
  296                 case DATA_TYPE_NVLIST_ARRAY:
  297                         for (j = 0; j < elements; j++) {
  298                                 printf("[%d]\n", j);
  299                                 p = nvlist_print(p, indent + 1);
  300                                 if (j != elements - 1) {
  301                                         for (i = 0; i < indent; i++)
  302                                                 printf(" ");
  303                                         printf("%s %s", typenames[pairtype], pairname);
  304                                 }
  305                         }
  306                         break;
  307 
  308                 default:
  309                         printf("\n");
  310                 }
  311 
  312                 p = pair + encoded_size;
  313 
  314                 pair = p;
  315                 xdr_int(&p, &encoded_size);
  316                 xdr_int(&p, &decoded_size);
  317         }
  318 
  319         return p;
  320 }
  321 
  322 #endif
  323 
  324 static int
  325 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
  326     off_t offset, size_t size)
  327 {
  328         size_t psize;
  329         int rc;
  330 
  331         if (bp) {
  332                 psize = BP_GET_PSIZE(bp);
  333         } else {
  334                 psize = size;
  335         }
  336 
  337         /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
  338         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
  339         if (rc)
  340                 return (rc);
  341         if (bp && zio_checksum_error(bp, buf))
  342                 return (EIO);
  343 
  344         return (0);
  345 }
  346 
  347 static int
  348 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  349     off_t offset, size_t bytes)
  350 {
  351 
  352         return (vdev_read_phys(vdev, bp, buf,
  353                 offset + VDEV_LABEL_START_SIZE, bytes));
  354 }
  355 
  356 
  357 static int
  358 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  359     off_t offset, size_t bytes)
  360 {
  361         vdev_t *kid;
  362         int rc;
  363 
  364         rc = EIO;
  365         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  366                 if (kid->v_state != VDEV_STATE_HEALTHY)
  367                         continue;
  368                 rc = kid->v_read(kid, bp, buf, offset, bytes);
  369                 if (!rc)
  370                         return (0);
  371         }
  372 
  373         return (rc);
  374 }
  375 
  376 static vdev_t *
  377 vdev_find(uint64_t guid)
  378 {
  379         vdev_t *vdev;
  380 
  381         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
  382                 if (vdev->v_guid == guid)
  383                         return (vdev);
  384 
  385         return (0);
  386 }
  387 
  388 static vdev_t *
  389 vdev_create(uint64_t guid, vdev_read_t *read)
  390 {
  391         vdev_t *vdev;
  392 
  393         vdev = malloc(sizeof(vdev_t));
  394         memset(vdev, 0, sizeof(vdev_t));
  395         STAILQ_INIT(&vdev->v_children);
  396         vdev->v_guid = guid;
  397         vdev->v_state = VDEV_STATE_OFFLINE;
  398         vdev->v_read = read;
  399         vdev->v_phys_read = 0;
  400         vdev->v_read_priv = 0;
  401         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
  402 
  403         return (vdev);
  404 }
  405 
  406 static int
  407 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
  408 {
  409         int rc;
  410         uint64_t guid, id, ashift, nparity;
  411         const char *type;
  412         const char *path;
  413         vdev_t *vdev, *kid;
  414         const unsigned char *kids;
  415         int nkids, i;
  416 
  417         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
  418                         DATA_TYPE_UINT64, 0, &guid)
  419             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
  420                            DATA_TYPE_UINT64, 0, &id)
  421             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
  422                            DATA_TYPE_STRING, 0, &type)) {
  423                 printf("ZFS: can't find vdev details\n");
  424                 return (ENOENT);
  425         }
  426 
  427         /*
  428          * Assume that if we've seen this vdev tree before, this one
  429          * will be identical.
  430          */
  431         vdev = vdev_find(guid);
  432         if (vdev) {
  433                 if (vdevp)
  434                         *vdevp = vdev;
  435                 return (0);
  436         }
  437 
  438         if (strcmp(type, VDEV_TYPE_MIRROR)
  439             && strcmp(type, VDEV_TYPE_DISK)
  440             && strcmp(type, VDEV_TYPE_RAIDZ)) {
  441                 printf("ZFS: can only boot from disk, mirror or raidz vdevs\n");
  442                 return (EIO);
  443         }
  444 
  445         if (!strcmp(type, VDEV_TYPE_MIRROR))
  446                 vdev = vdev_create(guid, vdev_mirror_read);
  447         else if (!strcmp(type, VDEV_TYPE_RAIDZ))
  448                 vdev = vdev_create(guid, vdev_raidz_read);
  449         else
  450                 vdev = vdev_create(guid, vdev_disk_read);
  451 
  452         vdev->v_id = id;
  453         if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
  454                 DATA_TYPE_UINT64, 0, &ashift) == 0)
  455                 vdev->v_ashift = ashift;
  456         else
  457                 vdev->v_ashift = 0;
  458         if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
  459                 DATA_TYPE_UINT64, 0, &nparity) == 0)
  460                 vdev->v_nparity = nparity;
  461         else
  462                 vdev->v_nparity = 0;
  463         if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
  464                         DATA_TYPE_STRING, 0, &path) == 0) {
  465                 if (strlen(path) > 5
  466                     && path[0] == '/'
  467                     && path[1] == 'd'
  468                     && path[2] == 'e'
  469                     && path[3] == 'v'
  470                     && path[4] == '/')
  471                         path += 5;
  472                 vdev->v_name = strdup(path);
  473         } else {
  474                 if (!strcmp(type, "raidz")) {
  475                         if (vdev->v_nparity == 1)
  476                                 vdev->v_name = "raidz1";
  477                         else
  478                                 vdev->v_name = "raidz2";
  479                 } else {
  480                         vdev->v_name = strdup(type);
  481                 }
  482         }
  483         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
  484                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
  485         /*
  486          * Its ok if we don't have any kids.
  487          */
  488         if (rc == 0) {
  489                 vdev->v_nchildren = nkids;
  490                 for (i = 0; i < nkids; i++) {
  491                         rc = vdev_init_from_nvlist(kids, &kid);
  492                         if (rc)
  493                                 return (rc);
  494                         STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
  495                         kids = nvlist_next(kids);
  496                 }
  497         } else {
  498                 vdev->v_nchildren = 0;
  499         }
  500 
  501         if (vdevp)
  502                 *vdevp = vdev;
  503         return (0);
  504 }
  505 
  506 static void
  507 vdev_set_state(vdev_t *vdev)
  508 {
  509         vdev_t *kid;
  510         int good_kids;
  511         int bad_kids;
  512 
  513         /*
  514          * A mirror or raidz is healthy if all its kids are healthy. A
  515          * mirror is degraded if any of its kids is healthy; a raidz
  516          * is degraded if at most nparity kids are offline.
  517          */
  518         if (STAILQ_FIRST(&vdev->v_children)) {
  519                 good_kids = 0;
  520                 bad_kids = 0;
  521                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  522                         if (kid->v_state == VDEV_STATE_HEALTHY)
  523                                 good_kids++;
  524                         else
  525                                 bad_kids++;
  526                 }
  527                 if (bad_kids == 0) {
  528                         vdev->v_state = VDEV_STATE_HEALTHY;
  529                 } else {
  530                         if (vdev->v_read == vdev_mirror_read) {
  531                                 if (good_kids) {
  532                                         vdev->v_state = VDEV_STATE_DEGRADED;
  533                                 } else {
  534                                         vdev->v_state = VDEV_STATE_OFFLINE;
  535                                 }
  536                         } else if (vdev->v_read == vdev_raidz_read) {
  537                                 if (bad_kids > vdev->v_nparity) {
  538                                         vdev->v_state = VDEV_STATE_OFFLINE;
  539                                 } else {
  540                                         vdev->v_state = VDEV_STATE_DEGRADED;
  541                                 }
  542                         }
  543                 }
  544         }
  545 }
  546 
  547 static spa_t *
  548 spa_find_by_guid(uint64_t guid)
  549 {
  550         spa_t *spa;
  551 
  552         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  553                 if (spa->spa_guid == guid)
  554                         return (spa);
  555 
  556         return (0);
  557 }
  558 
  559 #ifdef BOOT2
  560 
  561 static spa_t *
  562 spa_find_by_name(const char *name)
  563 {
  564         spa_t *spa;
  565 
  566         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  567                 if (!strcmp(spa->spa_name, name))
  568                         return (spa);
  569 
  570         return (0);
  571 }
  572 
  573 #endif
  574 
  575 static spa_t *
  576 spa_create(uint64_t guid)
  577 {
  578         spa_t *spa;
  579 
  580         spa = malloc(sizeof(spa_t));
  581         memset(spa, 0, sizeof(spa_t));
  582         STAILQ_INIT(&spa->spa_vdevs);
  583         spa->spa_guid = guid;
  584         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
  585 
  586         return (spa);
  587 }
  588 
  589 static const char *
  590 state_name(vdev_state_t state)
  591 {
  592         static const char* names[] = {
  593                 "UNKNOWN",
  594                 "CLOSED",
  595                 "OFFLINE",
  596                 "CANT_OPEN",
  597                 "DEGRADED",
  598                 "ONLINE"
  599         };
  600         return names[state];
  601 }
  602 
  603 #ifdef BOOT2
  604 
  605 #define pager_printf printf
  606 
  607 #else
  608 
  609 static void
  610 pager_printf(const char *fmt, ...)
  611 {
  612         char line[80];
  613         va_list args;
  614 
  615         va_start(args, fmt);
  616         vsprintf(line, fmt, args);
  617         va_end(args);
  618         pager_output(line);
  619 }
  620 
  621 #endif
  622 
  623 #define STATUS_FORMAT   "        %-16s %-10s\n"
  624 
  625 static void
  626 print_state(int indent, const char *name, vdev_state_t state)
  627 {
  628         int i;
  629         char buf[512];
  630 
  631         buf[0] = 0;
  632         for (i = 0; i < indent; i++)
  633                 strcat(buf, "  ");
  634         strcat(buf, name);
  635         pager_printf(STATUS_FORMAT, buf, state_name(state));
  636         
  637 }
  638 
  639 static void
  640 vdev_status(vdev_t *vdev, int indent)
  641 {
  642         vdev_t *kid;
  643         print_state(indent, vdev->v_name, vdev->v_state);
  644 
  645         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  646                 vdev_status(kid, indent + 1);
  647         }
  648 }
  649 
  650 static void
  651 spa_status(spa_t *spa)
  652 {
  653         vdev_t *vdev;
  654         int good_kids, bad_kids, degraded_kids;
  655         vdev_state_t state;
  656 
  657         pager_printf("  pool: %s\n", spa->spa_name);
  658         pager_printf("config:\n\n");
  659         pager_printf(STATUS_FORMAT, "NAME", "STATE");
  660 
  661         good_kids = 0;
  662         degraded_kids = 0;
  663         bad_kids = 0;
  664         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  665                 if (vdev->v_state == VDEV_STATE_HEALTHY)
  666                         good_kids++;
  667                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
  668                         degraded_kids++;
  669                 else
  670                         bad_kids++;
  671         }
  672 
  673         state = VDEV_STATE_CLOSED;
  674         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
  675                 state = VDEV_STATE_HEALTHY;
  676         else if ((good_kids + degraded_kids) > 0)
  677                 state = VDEV_STATE_DEGRADED;
  678 
  679         print_state(0, spa->spa_name, state);
  680         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  681                 vdev_status(vdev, 1);
  682         }
  683 }
  684 
  685 static void
  686 spa_all_status(void)
  687 {
  688         spa_t *spa;
  689         int first = 1;
  690 
  691         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
  692                 if (!first)
  693                         pager_printf("\n");
  694                 first = 0;
  695                 spa_status(spa);
  696         }
  697 }
  698 
  699 static int
  700 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
  701 {
  702         vdev_t vtmp;
  703         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
  704         spa_t *spa;
  705         vdev_t *vdev, *top_vdev, *pool_vdev;
  706         off_t off;
  707         blkptr_t bp;
  708         const unsigned char *nvlist;
  709         uint64_t val;
  710         uint64_t guid;
  711         uint64_t pool_txg, pool_guid;
  712         const char *pool_name;
  713         const unsigned char *vdevs;
  714         int i, rc;
  715         char upbuf[1024];
  716         const struct uberblock *up;
  717 
  718         /*
  719          * Load the vdev label and figure out which
  720          * uberblock is most current.
  721          */
  722         memset(&vtmp, 0, sizeof(vtmp));
  723         vtmp.v_phys_read = read;
  724         vtmp.v_read_priv = read_priv;
  725         off = offsetof(vdev_label_t, vl_vdev_phys);
  726         BP_ZERO(&bp);
  727         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
  728         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
  729         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  730         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  731         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  732         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
  733                 return (EIO);
  734 
  735         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
  736                 return (EIO);
  737         }
  738 
  739         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
  740 
  741         if (nvlist_find(nvlist,
  742                         ZPOOL_CONFIG_VERSION,
  743                         DATA_TYPE_UINT64, 0, &val)) {
  744                 return (EIO);
  745         }
  746 
  747         if (val > SPA_VERSION) {
  748                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
  749                     (unsigned) val, (unsigned) SPA_VERSION);
  750                 return (EIO);
  751         }
  752 
  753         if (nvlist_find(nvlist,
  754                         ZPOOL_CONFIG_POOL_STATE,
  755                         DATA_TYPE_UINT64, 0, &val)) {
  756                 return (EIO);
  757         }
  758 
  759 #ifndef TEST
  760         if (val != POOL_STATE_ACTIVE) {
  761                 /*
  762                  * Don't print a message here. If we happen to reboot
  763                  * while where is an exported pool around, we don't
  764                  * need a cascade of confusing messages during boot.
  765                  */
  766                 /*printf("ZFS: pool is not active\n");*/
  767                 return (EIO);
  768         }
  769 #endif
  770 
  771         if (nvlist_find(nvlist,
  772                         ZPOOL_CONFIG_POOL_TXG,
  773                         DATA_TYPE_UINT64, 0, &pool_txg)
  774             || nvlist_find(nvlist,
  775                            ZPOOL_CONFIG_POOL_GUID,
  776                            DATA_TYPE_UINT64, 0, &pool_guid)
  777             || nvlist_find(nvlist,
  778                            ZPOOL_CONFIG_POOL_NAME,
  779                            DATA_TYPE_STRING, 0, &pool_name)) {
  780                 /*
  781                  * Cache and spare devices end up here - just ignore
  782                  * them.
  783                  */
  784                 /*printf("ZFS: can't find pool details\n");*/
  785                 return (EIO);
  786         }
  787 
  788         /*
  789          * Create the pool if this is the first time we've seen it.
  790          */
  791         spa = spa_find_by_guid(pool_guid);
  792         if (!spa) {
  793                 spa = spa_create(pool_guid);
  794                 spa->spa_name = strdup(pool_name);
  795         }
  796         if (pool_txg > spa->spa_txg)
  797                 spa->spa_txg = pool_txg;
  798 
  799         /*
  800          * Get the vdev tree and create our in-core copy of it.
  801          * If we already have a healthy vdev with this guid, this must
  802          * be some kind of alias (overlapping slices, dangerously dedicated
  803          * disks etc).
  804          */
  805         if (nvlist_find(nvlist,
  806                         ZPOOL_CONFIG_GUID,
  807                         DATA_TYPE_UINT64, 0, &guid)) {
  808                 return (EIO);
  809         }
  810         vdev = vdev_find(guid);
  811         if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
  812                 return (EIO);
  813         }
  814 
  815         if (nvlist_find(nvlist,
  816                         ZPOOL_CONFIG_VDEV_TREE,
  817                         DATA_TYPE_NVLIST, 0, &vdevs)) {
  818                 return (EIO);
  819         }
  820         rc = vdev_init_from_nvlist(vdevs, &top_vdev);
  821         if (rc)
  822                 return (rc);
  823 
  824         /*
  825          * Add the toplevel vdev to the pool if its not already there.
  826          */
  827         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
  828                 if (top_vdev == pool_vdev)
  829                         break;
  830         if (!pool_vdev && top_vdev)
  831                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
  832 
  833         /*
  834          * We should already have created an incomplete vdev for this
  835          * vdev. Find it and initialise it with our read proc.
  836          */
  837         vdev = vdev_find(guid);
  838         if (vdev) {
  839                 vdev->v_phys_read = read;
  840                 vdev->v_read_priv = read_priv;
  841                 vdev->v_state = VDEV_STATE_HEALTHY;
  842         } else {
  843                 printf("ZFS: inconsistent nvlist contents\n");
  844                 return (EIO);
  845         }
  846 
  847         /*
  848          * Re-evaluate top-level vdev state.
  849          */
  850         vdev_set_state(top_vdev);
  851 
  852         /*
  853          * Ok, we are happy with the pool so far. Lets find
  854          * the best uberblock and then we can actually access
  855          * the contents of the pool.
  856          */
  857         for (i = 0;
  858              i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
  859              i++) {
  860                 off = offsetof(vdev_label_t, vl_uberblock);
  861                 off += i << UBERBLOCK_SHIFT;
  862                 BP_ZERO(&bp);
  863                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
  864                 BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
  865                 BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
  866                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  867                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  868                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  869                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
  870                         continue;
  871 
  872                 up = (const struct uberblock *) upbuf;
  873                 if (up->ub_magic != UBERBLOCK_MAGIC)
  874                         continue;
  875                 if (up->ub_txg < spa->spa_txg)
  876                         continue;
  877                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
  878                         spa->spa_uberblock = *up;
  879                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
  880                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
  881                                 spa->spa_uberblock = *up;
  882                 }
  883         }
  884 
  885         if (spap)
  886                 *spap = spa;
  887         return (0);
  888 }
  889 
  890 static int
  891 ilog2(int n)
  892 {
  893         int v;
  894 
  895         for (v = 0; v < 32; v++)
  896                 if (n == (1 << v))
  897                         return v;
  898         return -1;
  899 }
  900 
  901 static int
  902 zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
  903 {
  904         zio_gbh_phys_t zio_gb;
  905         vdev_t *vdev;
  906         int vdevid;
  907         off_t offset;
  908         int i;
  909 
  910         vdevid = DVA_GET_VDEV(dva);
  911         offset = DVA_GET_OFFSET(dva);
  912         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
  913                 if (vdev->v_id == vdevid)
  914                         break;
  915         if (!vdev || !vdev->v_read)
  916                 return (EIO);
  917         if (vdev->v_read(vdev, bp, &zio_gb, offset, SPA_GANGBLOCKSIZE))
  918                 return (EIO);
  919 
  920         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
  921                 if (zio_read(spa, &zio_gb.zg_blkptr[i], buf))
  922                         return (EIO);
  923         }
  924  
  925         return (0);
  926 }
  927 
  928 static int
  929 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
  930 {
  931         int cpfunc = BP_GET_COMPRESS(bp);
  932         size_t lsize = BP_GET_LSIZE(bp);
  933         size_t psize = BP_GET_PSIZE(bp);
  934         void *pbuf;
  935         int i;
  936 
  937         zfs_reset_temp();
  938         if (cpfunc != ZIO_COMPRESS_OFF)
  939                 pbuf = zfs_alloc_temp(psize);
  940         else
  941                 pbuf = buf;
  942 
  943         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
  944                 const dva_t *dva = &bp->blk_dva[i];
  945                 vdev_t *vdev;
  946                 int vdevid;
  947                 off_t offset;
  948 
  949                 if (!dva->dva_word[0] && !dva->dva_word[1])
  950                         continue;
  951 
  952                 if (DVA_GET_GANG(dva)) {
  953                         printf("ZFS: gang block detected!\n");
  954                         if (zio_read_gang(spa, bp, dva, buf))
  955                                 return (EIO); 
  956                 } else {
  957                         vdevid = DVA_GET_VDEV(dva);
  958                         offset = DVA_GET_OFFSET(dva);
  959                         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
  960                                 if (vdev->v_id == vdevid)
  961                                         break;
  962                         if (!vdev || !vdev->v_read) {
  963                                 continue;
  964                         }
  965                         if (vdev->v_read(vdev, bp, pbuf, offset, psize))
  966                                 continue;
  967 
  968                         if (cpfunc != ZIO_COMPRESS_OFF) {
  969                                 if (zio_decompress_data(cpfunc, pbuf, psize,
  970                                     buf, lsize))
  971                                         return (EIO);
  972                         }
  973                 }
  974 
  975                 return (0);
  976         }
  977         printf("ZFS: i/o error - all block copies unavailable\n");
  978 
  979         return (EIO);
  980 }
  981 
  982 static int
  983 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
  984 {
  985         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
  986         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
  987         int nlevels = dnode->dn_nlevels;
  988         int i, rc;
  989 
  990         /*
  991          * Note: bsize may not be a power of two here so we need to do an
  992          * actual divide rather than a bitshift.
  993          */
  994         while (buflen > 0) {
  995                 uint64_t bn = offset / bsize;
  996                 int boff = offset % bsize;
  997                 int ibn;
  998                 const blkptr_t *indbp;
  999                 blkptr_t bp;
 1000 
 1001                 if (bn > dnode->dn_maxblkid)
 1002                         return (EIO);
 1003 
 1004                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
 1005                         goto cached;
 1006 
 1007                 indbp = dnode->dn_blkptr;
 1008                 for (i = 0; i < nlevels; i++) {
 1009                         /*
 1010                          * Copy the bp from the indirect array so that
 1011                          * we can re-use the scratch buffer for multi-level
 1012                          * objects.
 1013                          */
 1014                         ibn = bn >> ((nlevels - i - 1) * ibshift);
 1015                         ibn &= ((1 << ibshift) - 1);
 1016                         bp = indbp[ibn];
 1017                         rc = zio_read(spa, &bp, dnode_cache_buf);
 1018                         if (rc)
 1019                                 return (rc);
 1020                         indbp = (const blkptr_t *) dnode_cache_buf;
 1021                 }
 1022                 dnode_cache_obj = dnode;
 1023                 dnode_cache_bn = bn;
 1024         cached:
 1025 
 1026                 /*
 1027                  * The buffer contains our data block. Copy what we
 1028                  * need from it and loop.
 1029                  */ 
 1030                 i = bsize - boff;
 1031                 if (i > buflen) i = buflen;
 1032                 memcpy(buf, &dnode_cache_buf[boff], i);
 1033                 buf = ((char*) buf) + i;
 1034                 offset += i;
 1035                 buflen -= i;
 1036         }
 1037 
 1038         return (0);
 1039 }
 1040 
 1041 /*
 1042  * Lookup a value in a microzap directory. Assumes that the zap
 1043  * scratch buffer contains the directory contents.
 1044  */
 1045 static int
 1046 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1047 {
 1048         const mzap_phys_t *mz;
 1049         const mzap_ent_phys_t *mze;
 1050         size_t size;
 1051         int chunks, i;
 1052 
 1053         /*
 1054          * Microzap objects use exactly one block. Read the whole
 1055          * thing.
 1056          */
 1057         size = dnode->dn_datablkszsec * 512;
 1058 
 1059         mz = (const mzap_phys_t *) zap_scratch;
 1060         chunks = size / MZAP_ENT_LEN - 1;
 1061 
 1062         for (i = 0; i < chunks; i++) {
 1063                 mze = &mz->mz_chunk[i];
 1064                 if (!strcmp(mze->mze_name, name)) {
 1065                         *value = mze->mze_value;
 1066                         return (0);
 1067                 }
 1068         }
 1069 
 1070         return (ENOENT);
 1071 }
 1072 
 1073 /*
 1074  * Compare a name with a zap leaf entry. Return non-zero if the name
 1075  * matches.
 1076  */
 1077 static int
 1078 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
 1079 {
 1080         size_t namelen;
 1081         const zap_leaf_chunk_t *nc;
 1082         const char *p;
 1083 
 1084         namelen = zc->l_entry.le_name_length;
 1085                         
 1086         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
 1087         p = name;
 1088         while (namelen > 0) {
 1089                 size_t len;
 1090                 len = namelen;
 1091                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1092                         len = ZAP_LEAF_ARRAY_BYTES;
 1093                 if (memcmp(p, nc->l_array.la_array, len))
 1094                         return (0);
 1095                 p += len;
 1096                 namelen -= len;
 1097                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
 1098         }
 1099 
 1100         return 1;
 1101 }
 1102 
 1103 /*
 1104  * Extract a uint64_t value from a zap leaf entry.
 1105  */
 1106 static uint64_t
 1107 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
 1108 {
 1109         const zap_leaf_chunk_t *vc;
 1110         int i;
 1111         uint64_t value;
 1112         const uint8_t *p;
 1113 
 1114         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
 1115         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
 1116                 value = (value << 8) | p[i];
 1117         }
 1118 
 1119         return value;
 1120 }
 1121 
 1122 /*
 1123  * Lookup a value in a fatzap directory. Assumes that the zap scratch
 1124  * buffer contains the directory header.
 1125  */
 1126 static int
 1127 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1128 {
 1129         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1130         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1131         fat_zap_t z;
 1132         uint64_t *ptrtbl;
 1133         uint64_t hash;
 1134         int rc;
 1135 
 1136         if (zh.zap_magic != ZAP_MAGIC)
 1137                 return (EIO);
 1138 
 1139         z.zap_block_shift = ilog2(bsize);
 1140         z.zap_phys = (zap_phys_t *) zap_scratch;
 1141 
 1142         /*
 1143          * Figure out where the pointer table is and read it in if necessary.
 1144          */
 1145         if (zh.zap_ptrtbl.zt_blk) {
 1146                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
 1147                                zap_scratch, bsize);
 1148                 if (rc)
 1149                         return (rc);
 1150                 ptrtbl = (uint64_t *) zap_scratch;
 1151         } else {
 1152                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
 1153         }
 1154 
 1155         hash = zap_hash(zh.zap_salt, name);
 1156 
 1157         zap_leaf_t zl;
 1158         zl.l_bs = z.zap_block_shift;
 1159 
 1160         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
 1161         zap_leaf_chunk_t *zc;
 1162 
 1163         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
 1164         if (rc)
 1165                 return (rc);
 1166 
 1167         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1168 
 1169         /*
 1170          * Make sure this chunk matches our hash.
 1171          */
 1172         if (zl.l_phys->l_hdr.lh_prefix_len > 0
 1173             && zl.l_phys->l_hdr.lh_prefix
 1174             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
 1175                 return (ENOENT);
 1176 
 1177         /*
 1178          * Hash within the chunk to find our entry.
 1179          */
 1180         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
 1181         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
 1182         h = zl.l_phys->l_hash[h];
 1183         if (h == 0xffff)
 1184                 return (ENOENT);
 1185         zc = &ZAP_LEAF_CHUNK(&zl, h);
 1186         while (zc->l_entry.le_hash != hash) {
 1187                 if (zc->l_entry.le_next == 0xffff) {
 1188                         zc = 0;
 1189                         break;
 1190                 }
 1191                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
 1192         }
 1193         if (fzap_name_equal(&zl, zc, name)) {
 1194                 *value = fzap_leaf_value(&zl, zc);
 1195                 return (0);
 1196         }
 1197 
 1198         return (ENOENT);
 1199 }
 1200 
 1201 /*
 1202  * Lookup a name in a zap object and return its value as a uint64_t.
 1203  */
 1204 static int
 1205 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1206 {
 1207         int rc;
 1208         uint64_t zap_type;
 1209         size_t size = dnode->dn_datablkszsec * 512;
 1210 
 1211         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
 1212         if (rc)
 1213                 return (rc);
 1214 
 1215         zap_type = *(uint64_t *) zap_scratch;
 1216         if (zap_type == ZBT_MICRO)
 1217                 return mzap_lookup(spa, dnode, name, value);
 1218         else
 1219                 return fzap_lookup(spa, dnode, name, value);
 1220 }
 1221 
 1222 #ifdef BOOT2
 1223 
 1224 /*
 1225  * List a microzap directory. Assumes that the zap scratch buffer contains
 1226  * the directory contents.
 1227  */
 1228 static int
 1229 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1230 {
 1231         const mzap_phys_t *mz;
 1232         const mzap_ent_phys_t *mze;
 1233         size_t size;
 1234         int chunks, i;
 1235 
 1236         /*
 1237          * Microzap objects use exactly one block. Read the whole
 1238          * thing.
 1239          */
 1240         size = dnode->dn_datablkszsec * 512;
 1241         mz = (const mzap_phys_t *) zap_scratch;
 1242         chunks = size / MZAP_ENT_LEN - 1;
 1243 
 1244         for (i = 0; i < chunks; i++) {
 1245                 mze = &mz->mz_chunk[i];
 1246                 if (mze->mze_name[0])
 1247                         //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
 1248                         printf("%s\n", mze->mze_name);
 1249         }
 1250 
 1251         return (0);
 1252 }
 1253 
 1254 /*
 1255  * List a fatzap directory. Assumes that the zap scratch buffer contains
 1256  * the directory header.
 1257  */
 1258 static int
 1259 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1260 {
 1261         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1262         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1263         fat_zap_t z;
 1264         int i, j;
 1265 
 1266         if (zh.zap_magic != ZAP_MAGIC)
 1267                 return (EIO);
 1268 
 1269         z.zap_block_shift = ilog2(bsize);
 1270         z.zap_phys = (zap_phys_t *) zap_scratch;
 1271 
 1272         /*
 1273          * This assumes that the leaf blocks start at block 1. The
 1274          * documentation isn't exactly clear on this.
 1275          */
 1276         zap_leaf_t zl;
 1277         zl.l_bs = z.zap_block_shift;
 1278         for (i = 0; i < zh.zap_num_leafs; i++) {
 1279                 off_t off = (i + 1) << zl.l_bs;
 1280                 char name[256], *p;
 1281                 uint64_t value;
 1282 
 1283                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
 1284                         return (EIO);
 1285 
 1286                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1287 
 1288                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
 1289                         zap_leaf_chunk_t *zc, *nc;
 1290                         int namelen;
 1291 
 1292                         zc = &ZAP_LEAF_CHUNK(&zl, j);
 1293                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
 1294                                 continue;
 1295                         namelen = zc->l_entry.le_name_length;
 1296                         if (namelen > sizeof(name))
 1297                                 namelen = sizeof(name);
 1298                         
 1299                         /*
 1300                          * Paste the name back together.
 1301                          */
 1302                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
 1303                         p = name;
 1304                         while (namelen > 0) {
 1305                                 int len;
 1306                                 len = namelen;
 1307                                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1308                                         len = ZAP_LEAF_ARRAY_BYTES;
 1309                                 memcpy(p, nc->l_array.la_array, len);
 1310                                 p += len;
 1311                                 namelen -= len;
 1312                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
 1313                         }
 1314 
 1315                         /*
 1316                          * Assume the first eight bytes of the value are
 1317                          * a uint64_t.
 1318                          */
 1319                         value = fzap_leaf_value(&zl, zc);
 1320 
 1321                         printf("%-32s 0x%llx\n", name, value);
 1322                 }
 1323         }
 1324 
 1325         return (0);
 1326 }
 1327 
 1328 /*
 1329  * List a zap directory.
 1330  */
 1331 static int
 1332 zap_list(spa_t *spa, const dnode_phys_t *dnode)
 1333 {
 1334         uint64_t zap_type;
 1335         size_t size = dnode->dn_datablkszsec * 512;
 1336 
 1337         if (dnode_read(spa, dnode, 0, zap_scratch, size))
 1338                 return (EIO);
 1339 
 1340         zap_type = *(uint64_t *) zap_scratch;
 1341         if (zap_type == ZBT_MICRO)
 1342                 return mzap_list(spa, dnode);
 1343         else
 1344                 return fzap_list(spa, dnode);
 1345 }
 1346 
 1347 #endif
 1348 
 1349 static int
 1350 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
 1351 {
 1352         off_t offset;
 1353 
 1354         offset = objnum * sizeof(dnode_phys_t);
 1355         return dnode_read(spa, &os->os_meta_dnode, offset,
 1356                 dnode, sizeof(dnode_phys_t));
 1357 }
 1358 
 1359 /*
 1360  * Find the object set given the object number of its dataset object
 1361  * and return its details in *objset
 1362  */
 1363 static int
 1364 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
 1365 {
 1366         dnode_phys_t dataset;
 1367         dsl_dataset_phys_t *ds;
 1368 
 1369         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
 1370                 printf("ZFS: can't find dataset %llu\n", objnum);
 1371                 return (EIO);
 1372         }
 1373 
 1374         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
 1375         if (zio_read(spa, &ds->ds_bp, objset)) {
 1376                 printf("ZFS: can't read object set for dataset %llu\n", objnum);
 1377                 return (EIO);
 1378         }
 1379 
 1380         return (0);
 1381 }
 1382 
 1383 /*
 1384  * Find the object set pointed to by the BOOTFS property or the root
 1385  * dataset if there is none and return its details in *objset
 1386  */
 1387 static int
 1388 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
 1389 {
 1390         dnode_phys_t dir, propdir;
 1391         uint64_t props, bootfs, root;
 1392 
 1393         /*
 1394          * Start with the MOS directory object.
 1395          */
 1396         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
 1397                 printf("ZFS: can't read MOS object directory\n");
 1398                 return (EIO);
 1399         }
 1400 
 1401         /*
 1402          * Lookup the pool_props and see if we can find a bootfs.
 1403          */
 1404         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
 1405              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
 1406              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
 1407              && bootfs != 0)
 1408                 return zfs_mount_dataset(spa, bootfs, objset);
 1409 
 1410         /*
 1411          * Lookup the root dataset directory
 1412          */
 1413         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
 1414             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
 1415                 printf("ZFS: can't find root dsl_dir\n");
 1416                 return (EIO);
 1417         }
 1418 
 1419         /*
 1420          * Use the information from the dataset directory's bonus buffer
 1421          * to find the dataset object and from that the object set itself.
 1422          */
 1423         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
 1424         return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
 1425 }
 1426 
 1427 static int
 1428 zfs_mount_pool(spa_t *spa)
 1429 {
 1430         /*
 1431          * Find the MOS and work our way in from there.
 1432          */
 1433         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
 1434                 printf("ZFS: can't read MOS\n");
 1435                 return (EIO);
 1436         }
 1437 
 1438         /*
 1439          * Find the root object set
 1440          */
 1441         if (zfs_mount_root(spa, &spa->spa_root_objset)) {
 1442                 printf("Can't find root filesystem - giving up\n");
 1443                 return (EIO);
 1444         }
 1445 
 1446         return (0);
 1447 }
 1448 
 1449 /*
 1450  * Lookup a file and return its dnode.
 1451  */
 1452 static int
 1453 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
 1454 {
 1455         int rc;
 1456         uint64_t objnum, rootnum, parentnum;
 1457         dnode_phys_t dn;
 1458         const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
 1459         const char *p, *q;
 1460         char element[256];
 1461         char path[1024];
 1462         int symlinks_followed = 0;
 1463 
 1464         if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
 1465                 printf("ZFS: unexpected object set type %llu\n",
 1466                        spa->spa_root_objset.os_type);
 1467                 return (EIO);
 1468         }
 1469 
 1470         /*
 1471          * Get the root directory dnode.
 1472          */
 1473         rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
 1474         if (rc)
 1475                 return (rc);
 1476 
 1477         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
 1478         if (rc)
 1479                 return (rc);
 1480 
 1481         rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
 1482         if (rc)
 1483                 return (rc);
 1484 
 1485         objnum = rootnum;
 1486         p = upath;
 1487         while (p && *p) {
 1488                 while (*p == '/')
 1489                         p++;
 1490                 if (!*p)
 1491                         break;
 1492                 q = strchr(p, '/');
 1493                 if (q) {
 1494                         memcpy(element, p, q - p);
 1495                         element[q - p] = 0;
 1496                         p = q;
 1497                 } else {
 1498                         strcpy(element, p);
 1499                         p = 0;
 1500                 }
 1501 
 1502                 if ((zp->zp_mode >> 12) != 0x4) {
 1503                         return (ENOTDIR);
 1504                 }
 1505 
 1506                 parentnum = objnum;
 1507                 rc = zap_lookup(spa, &dn, element, &objnum);
 1508                 if (rc)
 1509                         return (rc);
 1510                 objnum = ZFS_DIRENT_OBJ(objnum);
 1511 
 1512                 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1513                 if (rc)
 1514                         return (rc);
 1515 
 1516                 /*
 1517                  * Check for symlink.
 1518                  */
 1519                 if ((zp->zp_mode >> 12) == 0xa) {
 1520                         if (symlinks_followed > 10)
 1521                                 return (EMLINK);
 1522                         symlinks_followed++;
 1523 
 1524                         /*
 1525                          * Read the link value and copy the tail of our
 1526                          * current path onto the end.
 1527                          */
 1528                         if (p)
 1529                                 strcpy(&path[zp->zp_size], p);
 1530                         else
 1531                                 path[zp->zp_size] = 0;
 1532                         if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
 1533                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
 1534                                         zp->zp_size);
 1535                         } else {
 1536                                 rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
 1537                                 if (rc)
 1538                                         return (rc);
 1539                         }
 1540 
 1541                         /*
 1542                          * Restart with the new path, starting either at
 1543                          * the root or at the parent depending whether or
 1544                          * not the link is relative.
 1545                          */
 1546                         p = path;
 1547                         if (*p == '/')
 1548                                 objnum = rootnum;
 1549                         else
 1550                                 objnum = parentnum;
 1551                         objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1552                 }
 1553         }
 1554 
 1555         *dnode = dn;
 1556         return (0);
 1557 }

Cache object: fb64e073079df57076a9a9d9ceec119c


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.