The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2007 Doug Rabson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/9.0/sys/boot/zfs/zfsimpl.c 227710 2011-11-19 11:47:15Z pjd $");
   29 
   30 /*
   31  *      Stand-alone ZFS file reader.
   32  */
   33 
   34 #include <sys/stat.h>
   35 
   36 #include "zfsimpl.h"
   37 #include "zfssubr.c"
   38 
   39 /*
   40  * List of all vdevs, chained through v_alllink.
   41  */
   42 static vdev_list_t zfs_vdevs;
   43 
   44 /*
   45  * List of all pools, chained through spa_link.
   46  */
   47 static spa_list_t zfs_pools;
   48 
   49 static uint64_t zfs_crc64_table[256];
   50 static const dnode_phys_t *dnode_cache_obj = 0;
   51 static uint64_t dnode_cache_bn;
   52 static char *dnode_cache_buf;
   53 static char *zap_scratch;
   54 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
   55 
   56 #define TEMP_SIZE       (1024 * 1024)
   57 
   58 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
   59 
   60 static void
   61 zfs_init(void)
   62 {
   63         STAILQ_INIT(&zfs_vdevs);
   64         STAILQ_INIT(&zfs_pools);
   65 
   66         zfs_temp_buf = malloc(TEMP_SIZE);
   67         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
   68         zfs_temp_ptr = zfs_temp_buf;
   69         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
   70         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
   71 
   72         zfs_init_crc();
   73 }
   74 
   75 static void *
   76 zfs_alloc(size_t size)
   77 {
   78         char *ptr;
   79 
   80         if (zfs_temp_ptr + size > zfs_temp_end) {
   81                 printf("ZFS: out of temporary buffer space\n");
   82                 for (;;) ;
   83         }
   84         ptr = zfs_temp_ptr;
   85         zfs_temp_ptr += size;
   86 
   87         return (ptr);
   88 }
   89 
   90 static void
   91 zfs_free(void *ptr, size_t size)
   92 {
   93 
   94         zfs_temp_ptr -= size;
   95         if (zfs_temp_ptr != ptr) {
   96                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
   97                 for (;;) ;
   98         }
   99 }
  100 
  101 static int
  102 xdr_int(const unsigned char **xdr, int *ip)
  103 {
  104         *ip = ((*xdr)[0] << 24)
  105                 | ((*xdr)[1] << 16)
  106                 | ((*xdr)[2] << 8)
  107                 | ((*xdr)[3] << 0);
  108         (*xdr) += 4;
  109         return (0);
  110 }
  111 
  112 static int
  113 xdr_u_int(const unsigned char **xdr, u_int *ip)
  114 {
  115         *ip = ((*xdr)[0] << 24)
  116                 | ((*xdr)[1] << 16)
  117                 | ((*xdr)[2] << 8)
  118                 | ((*xdr)[3] << 0);
  119         (*xdr) += 4;
  120         return (0);
  121 }
  122 
  123 static int
  124 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
  125 {
  126         u_int hi, lo;
  127 
  128         xdr_u_int(xdr, &hi);
  129         xdr_u_int(xdr, &lo);
  130         *lp = (((uint64_t) hi) << 32) | lo;
  131         return (0);
  132 }
  133 
  134 static int
  135 nvlist_find(const unsigned char *nvlist, const char *name, int type,
  136             int* elementsp, void *valuep)
  137 {
  138         const unsigned char *p, *pair;
  139         int junk;
  140         int encoded_size, decoded_size;
  141 
  142         p = nvlist;
  143         xdr_int(&p, &junk);
  144         xdr_int(&p, &junk);
  145 
  146         pair = p;
  147         xdr_int(&p, &encoded_size);
  148         xdr_int(&p, &decoded_size);
  149         while (encoded_size && decoded_size) {
  150                 int namelen, pairtype, elements;
  151                 const char *pairname;
  152 
  153                 xdr_int(&p, &namelen);
  154                 pairname = (const char*) p;
  155                 p += roundup(namelen, 4);
  156                 xdr_int(&p, &pairtype);
  157 
  158                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
  159                         xdr_int(&p, &elements);
  160                         if (elementsp)
  161                                 *elementsp = elements;
  162                         if (type == DATA_TYPE_UINT64) {
  163                                 xdr_uint64_t(&p, (uint64_t *) valuep);
  164                                 return (0);
  165                         } else if (type == DATA_TYPE_STRING) {
  166                                 int len;
  167                                 xdr_int(&p, &len);
  168                                 (*(const char**) valuep) = (const char*) p;
  169                                 return (0);
  170                         } else if (type == DATA_TYPE_NVLIST
  171                                    || type == DATA_TYPE_NVLIST_ARRAY) {
  172                                 (*(const unsigned char**) valuep) =
  173                                          (const unsigned char*) p;
  174                                 return (0);
  175                         } else {
  176                                 return (EIO);
  177                         }
  178                 } else {
  179                         /*
  180                          * Not the pair we are looking for, skip to the next one.
  181                          */
  182                         p = pair + encoded_size;
  183                 }
  184 
  185                 pair = p;
  186                 xdr_int(&p, &encoded_size);
  187                 xdr_int(&p, &decoded_size);
  188         }
  189 
  190         return (EIO);
  191 }
  192 
  193 /*
  194  * Return the next nvlist in an nvlist array.
  195  */
  196 static const unsigned char *
  197 nvlist_next(const unsigned char *nvlist)
  198 {
  199         const unsigned char *p, *pair;
  200         int junk;
  201         int encoded_size, decoded_size;
  202 
  203         p = nvlist;
  204         xdr_int(&p, &junk);
  205         xdr_int(&p, &junk);
  206 
  207         pair = p;
  208         xdr_int(&p, &encoded_size);
  209         xdr_int(&p, &decoded_size);
  210         while (encoded_size && decoded_size) {
  211                 p = pair + encoded_size;
  212 
  213                 pair = p;
  214                 xdr_int(&p, &encoded_size);
  215                 xdr_int(&p, &decoded_size);
  216         }
  217 
  218         return p;
  219 }
  220 
  221 #ifdef TEST
  222 
  223 static const unsigned char *
  224 nvlist_print(const unsigned char *nvlist, unsigned int indent)
  225 {
  226         static const char* typenames[] = {
  227                 "DATA_TYPE_UNKNOWN",
  228                 "DATA_TYPE_BOOLEAN",
  229                 "DATA_TYPE_BYTE",
  230                 "DATA_TYPE_INT16",
  231                 "DATA_TYPE_UINT16",
  232                 "DATA_TYPE_INT32",
  233                 "DATA_TYPE_UINT32",
  234                 "DATA_TYPE_INT64",
  235                 "DATA_TYPE_UINT64",
  236                 "DATA_TYPE_STRING",
  237                 "DATA_TYPE_BYTE_ARRAY",
  238                 "DATA_TYPE_INT16_ARRAY",
  239                 "DATA_TYPE_UINT16_ARRAY",
  240                 "DATA_TYPE_INT32_ARRAY",
  241                 "DATA_TYPE_UINT32_ARRAY",
  242                 "DATA_TYPE_INT64_ARRAY",
  243                 "DATA_TYPE_UINT64_ARRAY",
  244                 "DATA_TYPE_STRING_ARRAY",
  245                 "DATA_TYPE_HRTIME",
  246                 "DATA_TYPE_NVLIST",
  247                 "DATA_TYPE_NVLIST_ARRAY",
  248                 "DATA_TYPE_BOOLEAN_VALUE",
  249                 "DATA_TYPE_INT8",
  250                 "DATA_TYPE_UINT8",
  251                 "DATA_TYPE_BOOLEAN_ARRAY",
  252                 "DATA_TYPE_INT8_ARRAY",
  253                 "DATA_TYPE_UINT8_ARRAY"
  254         };
  255 
  256         unsigned int i, j;
  257         const unsigned char *p, *pair;
  258         int junk;
  259         int encoded_size, decoded_size;
  260 
  261         p = nvlist;
  262         xdr_int(&p, &junk);
  263         xdr_int(&p, &junk);
  264 
  265         pair = p;
  266         xdr_int(&p, &encoded_size);
  267         xdr_int(&p, &decoded_size);
  268         while (encoded_size && decoded_size) {
  269                 int namelen, pairtype, elements;
  270                 const char *pairname;
  271 
  272                 xdr_int(&p, &namelen);
  273                 pairname = (const char*) p;
  274                 p += roundup(namelen, 4);
  275                 xdr_int(&p, &pairtype);
  276 
  277                 for (i = 0; i < indent; i++)
  278                         printf(" ");
  279                 printf("%s %s", typenames[pairtype], pairname);
  280 
  281                 xdr_int(&p, &elements);
  282                 switch (pairtype) {
  283                 case DATA_TYPE_UINT64: {
  284                         uint64_t val;
  285                         xdr_uint64_t(&p, &val);
  286                         printf(" = 0x%llx\n", val);
  287                         break;
  288                 }
  289 
  290                 case DATA_TYPE_STRING: {
  291                         int len;
  292                         xdr_int(&p, &len);
  293                         printf(" = \"%s\"\n", p);
  294                         break;
  295                 }
  296 
  297                 case DATA_TYPE_NVLIST:
  298                         printf("\n");
  299                         nvlist_print(p, indent + 1);
  300                         break;
  301 
  302                 case DATA_TYPE_NVLIST_ARRAY:
  303                         for (j = 0; j < elements; j++) {
  304                                 printf("[%d]\n", j);
  305                                 p = nvlist_print(p, indent + 1);
  306                                 if (j != elements - 1) {
  307                                         for (i = 0; i < indent; i++)
  308                                                 printf(" ");
  309                                         printf("%s %s", typenames[pairtype], pairname);
  310                                 }
  311                         }
  312                         break;
  313 
  314                 default:
  315                         printf("\n");
  316                 }
  317 
  318                 p = pair + encoded_size;
  319 
  320                 pair = p;
  321                 xdr_int(&p, &encoded_size);
  322                 xdr_int(&p, &decoded_size);
  323         }
  324 
  325         return p;
  326 }
  327 
  328 #endif
  329 
  330 static int
  331 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
  332     off_t offset, size_t size)
  333 {
  334         size_t psize;
  335         int rc;
  336 
  337         if (!vdev->v_phys_read)
  338                 return (EIO);
  339 
  340         if (bp) {
  341                 psize = BP_GET_PSIZE(bp);
  342         } else {
  343                 psize = size;
  344         }
  345 
  346         /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
  347         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
  348         if (rc)
  349                 return (rc);
  350         if (bp && zio_checksum_verify(bp, buf))
  351                 return (EIO);
  352 
  353         return (0);
  354 }
  355 
  356 static int
  357 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  358     off_t offset, size_t bytes)
  359 {
  360 
  361         return (vdev_read_phys(vdev, bp, buf,
  362                 offset + VDEV_LABEL_START_SIZE, bytes));
  363 }
  364 
  365 
  366 static int
  367 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  368     off_t offset, size_t bytes)
  369 {
  370         vdev_t *kid;
  371         int rc;
  372 
  373         rc = EIO;
  374         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  375                 if (kid->v_state != VDEV_STATE_HEALTHY)
  376                         continue;
  377                 rc = kid->v_read(kid, bp, buf, offset, bytes);
  378                 if (!rc)
  379                         return (0);
  380         }
  381 
  382         return (rc);
  383 }
  384 
  385 static int
  386 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  387     off_t offset, size_t bytes)
  388 {
  389         vdev_t *kid;
  390 
  391         /*
  392          * Here we should have two kids:
  393          * First one which is the one we are replacing and we can trust
  394          * only this one to have valid data, but it might not be present.
  395          * Second one is that one we are replacing with. It is most likely
  396          * healthy, but we can't trust it has needed data, so we won't use it.
  397          */
  398         kid = STAILQ_FIRST(&vdev->v_children);
  399         if (kid == NULL)
  400                 return (EIO);
  401         if (kid->v_state != VDEV_STATE_HEALTHY)
  402                 return (EIO);
  403         return (kid->v_read(kid, bp, buf, offset, bytes));
  404 }
  405 
  406 static vdev_t *
  407 vdev_find(uint64_t guid)
  408 {
  409         vdev_t *vdev;
  410 
  411         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
  412                 if (vdev->v_guid == guid)
  413                         return (vdev);
  414 
  415         return (0);
  416 }
  417 
  418 static vdev_t *
  419 vdev_create(uint64_t guid, vdev_read_t *read)
  420 {
  421         vdev_t *vdev;
  422 
  423         vdev = malloc(sizeof(vdev_t));
  424         memset(vdev, 0, sizeof(vdev_t));
  425         STAILQ_INIT(&vdev->v_children);
  426         vdev->v_guid = guid;
  427         vdev->v_state = VDEV_STATE_OFFLINE;
  428         vdev->v_read = read;
  429         vdev->v_phys_read = 0;
  430         vdev->v_read_priv = 0;
  431         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
  432 
  433         return (vdev);
  434 }
  435 
  436 static int
  437 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
  438     vdev_t **vdevp, int is_newer)
  439 {
  440         int rc;
  441         uint64_t guid, id, ashift, nparity;
  442         const char *type;
  443         const char *path;
  444         vdev_t *vdev, *kid;
  445         const unsigned char *kids;
  446         int nkids, i, is_new;
  447         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
  448 
  449         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
  450                         DATA_TYPE_UINT64, 0, &guid)
  451             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
  452                            DATA_TYPE_UINT64, 0, &id)
  453             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
  454                            DATA_TYPE_STRING, 0, &type)) {
  455                 printf("ZFS: can't find vdev details\n");
  456                 return (ENOENT);
  457         }
  458 
  459         if (strcmp(type, VDEV_TYPE_MIRROR)
  460             && strcmp(type, VDEV_TYPE_DISK)
  461             && strcmp(type, VDEV_TYPE_RAIDZ)
  462             && strcmp(type, VDEV_TYPE_REPLACING)) {
  463                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
  464                 return (EIO);
  465         }
  466 
  467         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
  468 
  469         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
  470                         &is_offline);
  471         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
  472                         &is_removed);
  473         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
  474                         &is_faulted);
  475         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
  476                         &is_degraded);
  477         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
  478                         &isnt_present);
  479 
  480         vdev = vdev_find(guid);
  481         if (!vdev) {
  482                 is_new = 1;
  483 
  484                 if (!strcmp(type, VDEV_TYPE_MIRROR))
  485                         vdev = vdev_create(guid, vdev_mirror_read);
  486                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
  487                         vdev = vdev_create(guid, vdev_raidz_read);
  488                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
  489                         vdev = vdev_create(guid, vdev_replacing_read);
  490                 else
  491                         vdev = vdev_create(guid, vdev_disk_read);
  492 
  493                 vdev->v_id = id;
  494                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
  495                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
  496                         DATA_TYPE_UINT64, 0, &ashift) == 0)
  497                         vdev->v_ashift = ashift;
  498                 else
  499                         vdev->v_ashift = 0;
  500                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
  501                         DATA_TYPE_UINT64, 0, &nparity) == 0)
  502                         vdev->v_nparity = nparity;
  503                 else
  504                         vdev->v_nparity = 0;
  505                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
  506                                 DATA_TYPE_STRING, 0, &path) == 0) {
  507                         if (strncmp(path, "/dev/", 5) == 0)
  508                                 path += 5;
  509                         vdev->v_name = strdup(path);
  510                 } else {
  511                         if (!strcmp(type, "raidz")) {
  512                                 if (vdev->v_nparity == 1)
  513                                         vdev->v_name = "raidz1";
  514                                 else if (vdev->v_nparity == 2)
  515                                         vdev->v_name = "raidz2";
  516                                 else if (vdev->v_nparity == 3)
  517                                         vdev->v_name = "raidz3";
  518                                 else {
  519                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
  520                                         return (EIO);
  521                                 }
  522                         } else {
  523                                 vdev->v_name = strdup(type);
  524                         }
  525                 }
  526         } else {
  527                 is_new = 0;
  528         }
  529 
  530         if (is_new || is_newer) {
  531                 /*
  532                  * This is either new vdev or we've already seen this vdev,
  533                  * but from an older vdev label, so let's refresh its state
  534                  * from the newer label.
  535                  */
  536                 if (is_offline)
  537                         vdev->v_state = VDEV_STATE_OFFLINE;
  538                 else if (is_removed)
  539                         vdev->v_state = VDEV_STATE_REMOVED;
  540                 else if (is_faulted)
  541                         vdev->v_state = VDEV_STATE_FAULTED;
  542                 else if (is_degraded)
  543                         vdev->v_state = VDEV_STATE_DEGRADED;
  544                 else if (isnt_present)
  545                         vdev->v_state = VDEV_STATE_CANT_OPEN;
  546         }
  547 
  548         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
  549                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
  550         /*
  551          * Its ok if we don't have any kids.
  552          */
  553         if (rc == 0) {
  554                 vdev->v_nchildren = nkids;
  555                 for (i = 0; i < nkids; i++) {
  556                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
  557                         if (rc)
  558                                 return (rc);
  559                         if (is_new)
  560                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
  561                                                    v_childlink);
  562                         kids = nvlist_next(kids);
  563                 }
  564         } else {
  565                 vdev->v_nchildren = 0;
  566         }
  567 
  568         if (vdevp)
  569                 *vdevp = vdev;
  570         return (0);
  571 }
  572 
  573 static void
  574 vdev_set_state(vdev_t *vdev)
  575 {
  576         vdev_t *kid;
  577         int good_kids;
  578         int bad_kids;
  579 
  580         /*
  581          * A mirror or raidz is healthy if all its kids are healthy. A
  582          * mirror is degraded if any of its kids is healthy; a raidz
  583          * is degraded if at most nparity kids are offline.
  584          */
  585         if (STAILQ_FIRST(&vdev->v_children)) {
  586                 good_kids = 0;
  587                 bad_kids = 0;
  588                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  589                         if (kid->v_state == VDEV_STATE_HEALTHY)
  590                                 good_kids++;
  591                         else
  592                                 bad_kids++;
  593                 }
  594                 if (bad_kids == 0) {
  595                         vdev->v_state = VDEV_STATE_HEALTHY;
  596                 } else {
  597                         if (vdev->v_read == vdev_mirror_read) {
  598                                 if (good_kids) {
  599                                         vdev->v_state = VDEV_STATE_DEGRADED;
  600                                 } else {
  601                                         vdev->v_state = VDEV_STATE_OFFLINE;
  602                                 }
  603                         } else if (vdev->v_read == vdev_raidz_read) {
  604                                 if (bad_kids > vdev->v_nparity) {
  605                                         vdev->v_state = VDEV_STATE_OFFLINE;
  606                                 } else {
  607                                         vdev->v_state = VDEV_STATE_DEGRADED;
  608                                 }
  609                         }
  610                 }
  611         }
  612 }
  613 
  614 static spa_t *
  615 spa_find_by_guid(uint64_t guid)
  616 {
  617         spa_t *spa;
  618 
  619         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  620                 if (spa->spa_guid == guid)
  621                         return (spa);
  622 
  623         return (0);
  624 }
  625 
  626 #ifdef BOOT2
  627 
  628 static spa_t *
  629 spa_find_by_name(const char *name)
  630 {
  631         spa_t *spa;
  632 
  633         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  634                 if (!strcmp(spa->spa_name, name))
  635                         return (spa);
  636 
  637         return (0);
  638 }
  639 
  640 #endif
  641 
  642 static spa_t *
  643 spa_create(uint64_t guid)
  644 {
  645         spa_t *spa;
  646 
  647         spa = malloc(sizeof(spa_t));
  648         memset(spa, 0, sizeof(spa_t));
  649         STAILQ_INIT(&spa->spa_vdevs);
  650         spa->spa_guid = guid;
  651         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
  652 
  653         return (spa);
  654 }
  655 
  656 static const char *
  657 state_name(vdev_state_t state)
  658 {
  659         static const char* names[] = {
  660                 "UNKNOWN",
  661                 "CLOSED",
  662                 "OFFLINE",
  663                 "REMOVED",
  664                 "CANT_OPEN",
  665                 "FAULTED",
  666                 "DEGRADED",
  667                 "ONLINE"
  668         };
  669         return names[state];
  670 }
  671 
  672 #ifdef BOOT2
  673 
  674 #define pager_printf printf
  675 
  676 #else
  677 
  678 static void
  679 pager_printf(const char *fmt, ...)
  680 {
  681         char line[80];
  682         va_list args;
  683 
  684         va_start(args, fmt);
  685         vsprintf(line, fmt, args);
  686         va_end(args);
  687         pager_output(line);
  688 }
  689 
  690 #endif
  691 
  692 #define STATUS_FORMAT   "        %s %s\n"
  693 
  694 static void
  695 print_state(int indent, const char *name, vdev_state_t state)
  696 {
  697         int i;
  698         char buf[512];
  699 
  700         buf[0] = 0;
  701         for (i = 0; i < indent; i++)
  702                 strcat(buf, "  ");
  703         strcat(buf, name);
  704         pager_printf(STATUS_FORMAT, buf, state_name(state));
  705         
  706 }
  707 
  708 static void
  709 vdev_status(vdev_t *vdev, int indent)
  710 {
  711         vdev_t *kid;
  712         print_state(indent, vdev->v_name, vdev->v_state);
  713 
  714         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  715                 vdev_status(kid, indent + 1);
  716         }
  717 }
  718 
  719 static void
  720 spa_status(spa_t *spa)
  721 {
  722         vdev_t *vdev;
  723         int good_kids, bad_kids, degraded_kids;
  724         vdev_state_t state;
  725 
  726         pager_printf("  pool: %s\n", spa->spa_name);
  727         pager_printf("config:\n\n");
  728         pager_printf(STATUS_FORMAT, "NAME", "STATE");
  729 
  730         good_kids = 0;
  731         degraded_kids = 0;
  732         bad_kids = 0;
  733         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  734                 if (vdev->v_state == VDEV_STATE_HEALTHY)
  735                         good_kids++;
  736                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
  737                         degraded_kids++;
  738                 else
  739                         bad_kids++;
  740         }
  741 
  742         state = VDEV_STATE_CLOSED;
  743         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
  744                 state = VDEV_STATE_HEALTHY;
  745         else if ((good_kids + degraded_kids) > 0)
  746                 state = VDEV_STATE_DEGRADED;
  747 
  748         print_state(0, spa->spa_name, state);
  749         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  750                 vdev_status(vdev, 1);
  751         }
  752 }
  753 
  754 static void
  755 spa_all_status(void)
  756 {
  757         spa_t *spa;
  758         int first = 1;
  759 
  760         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
  761                 if (!first)
  762                         pager_printf("\n");
  763                 first = 0;
  764                 spa_status(spa);
  765         }
  766 }
  767 
  768 static int
  769 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
  770 {
  771         vdev_t vtmp;
  772         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
  773         spa_t *spa;
  774         vdev_t *vdev, *top_vdev, *pool_vdev;
  775         off_t off;
  776         blkptr_t bp;
  777         const unsigned char *nvlist;
  778         uint64_t val;
  779         uint64_t guid;
  780         uint64_t pool_txg, pool_guid;
  781         uint64_t is_log;
  782         const char *pool_name;
  783         const unsigned char *vdevs;
  784         int i, rc, is_newer;
  785         char *upbuf;
  786         const struct uberblock *up;
  787 
  788         /*
  789          * Load the vdev label and figure out which
  790          * uberblock is most current.
  791          */
  792         memset(&vtmp, 0, sizeof(vtmp));
  793         vtmp.v_phys_read = read;
  794         vtmp.v_read_priv = read_priv;
  795         off = offsetof(vdev_label_t, vl_vdev_phys);
  796         BP_ZERO(&bp);
  797         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
  798         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
  799         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  800         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  801         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
  802         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  803         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
  804                 return (EIO);
  805 
  806         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
  807                 return (EIO);
  808         }
  809 
  810         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
  811 
  812         if (nvlist_find(nvlist,
  813                         ZPOOL_CONFIG_VERSION,
  814                         DATA_TYPE_UINT64, 0, &val)) {
  815                 return (EIO);
  816         }
  817 
  818         if (val > SPA_VERSION) {
  819                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
  820                     (unsigned) val, (unsigned) SPA_VERSION);
  821                 return (EIO);
  822         }
  823 
  824         if (nvlist_find(nvlist,
  825                         ZPOOL_CONFIG_POOL_STATE,
  826                         DATA_TYPE_UINT64, 0, &val)) {
  827                 return (EIO);
  828         }
  829 
  830         if (val == POOL_STATE_DESTROYED) {
  831                 /* We don't boot only from destroyed pools. */
  832                 return (EIO);
  833         }
  834 
  835         if (nvlist_find(nvlist,
  836                         ZPOOL_CONFIG_POOL_TXG,
  837                         DATA_TYPE_UINT64, 0, &pool_txg)
  838             || nvlist_find(nvlist,
  839                            ZPOOL_CONFIG_POOL_GUID,
  840                            DATA_TYPE_UINT64, 0, &pool_guid)
  841             || nvlist_find(nvlist,
  842                            ZPOOL_CONFIG_POOL_NAME,
  843                            DATA_TYPE_STRING, 0, &pool_name)) {
  844                 /*
  845                  * Cache and spare devices end up here - just ignore
  846                  * them.
  847                  */
  848                 /*printf("ZFS: can't find pool details\n");*/
  849                 return (EIO);
  850         }
  851 
  852         is_log = 0;
  853         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
  854             &is_log);
  855         if (is_log)
  856                 return (EIO);
  857 
  858         /*
  859          * Create the pool if this is the first time we've seen it.
  860          */
  861         spa = spa_find_by_guid(pool_guid);
  862         if (!spa) {
  863                 spa = spa_create(pool_guid);
  864                 spa->spa_name = strdup(pool_name);
  865         }
  866         if (pool_txg > spa->spa_txg) {
  867                 spa->spa_txg = pool_txg;
  868                 is_newer = 1;
  869         } else
  870                 is_newer = 0;
  871 
  872         /*
  873          * Get the vdev tree and create our in-core copy of it.
  874          * If we already have a vdev with this guid, this must
  875          * be some kind of alias (overlapping slices, dangerously dedicated
  876          * disks etc).
  877          */
  878         if (nvlist_find(nvlist,
  879                         ZPOOL_CONFIG_GUID,
  880                         DATA_TYPE_UINT64, 0, &guid)) {
  881                 return (EIO);
  882         }
  883         vdev = vdev_find(guid);
  884         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
  885                 return (EIO);
  886 
  887         if (nvlist_find(nvlist,
  888                         ZPOOL_CONFIG_VDEV_TREE,
  889                         DATA_TYPE_NVLIST, 0, &vdevs)) {
  890                 return (EIO);
  891         }
  892 
  893         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
  894         if (rc)
  895                 return (rc);
  896 
  897         /*
  898          * Add the toplevel vdev to the pool if its not already there.
  899          */
  900         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
  901                 if (top_vdev == pool_vdev)
  902                         break;
  903         if (!pool_vdev && top_vdev)
  904                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
  905 
  906         /*
  907          * We should already have created an incomplete vdev for this
  908          * vdev. Find it and initialise it with our read proc.
  909          */
  910         vdev = vdev_find(guid);
  911         if (vdev) {
  912                 vdev->v_phys_read = read;
  913                 vdev->v_read_priv = read_priv;
  914                 vdev->v_state = VDEV_STATE_HEALTHY;
  915         } else {
  916                 printf("ZFS: inconsistent nvlist contents\n");
  917                 return (EIO);
  918         }
  919 
  920         /*
  921          * Re-evaluate top-level vdev state.
  922          */
  923         vdev_set_state(top_vdev);
  924 
  925         /*
  926          * Ok, we are happy with the pool so far. Lets find
  927          * the best uberblock and then we can actually access
  928          * the contents of the pool.
  929          */
  930         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
  931         up = (const struct uberblock *)upbuf;
  932         for (i = 0;
  933              i < VDEV_UBERBLOCK_COUNT(vdev);
  934              i++) {
  935                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
  936                 BP_ZERO(&bp);
  937                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
  938                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
  939                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
  940                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  941                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  942                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  943 
  944                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
  945                         continue;
  946 
  947                 if (up->ub_magic != UBERBLOCK_MAGIC)
  948                         continue;
  949                 if (up->ub_txg < spa->spa_txg)
  950                         continue;
  951                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
  952                         spa->spa_uberblock = *up;
  953                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
  954                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
  955                                 spa->spa_uberblock = *up;
  956                 }
  957         }
  958         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
  959 
  960         if (spap)
  961                 *spap = spa;
  962         return (0);
  963 }
  964 
  965 static int
  966 ilog2(int n)
  967 {
  968         int v;
  969 
  970         for (v = 0; v < 32; v++)
  971                 if (n == (1 << v))
  972                         return v;
  973         return -1;
  974 }
  975 
  976 static int
  977 zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
  978 {
  979         blkptr_t gbh_bp;
  980         zio_gbh_phys_t zio_gb;
  981         char *pbuf;
  982         int i;
  983 
  984         /* Artificial BP for gang block header. */
  985         gbh_bp = *bp;
  986         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
  987         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
  988         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
  989         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
  990         for (i = 0; i < SPA_DVAS_PER_BP; i++)
  991                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
  992 
  993         /* Read gang header block using the artificial BP. */
  994         if (zio_read(spa, &gbh_bp, &zio_gb))
  995                 return (EIO);
  996 
  997         pbuf = buf;
  998         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
  999                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
 1000 
 1001                 if (BP_IS_HOLE(gbp))
 1002                         continue;
 1003                 if (zio_read(spa, gbp, pbuf))
 1004                         return (EIO);
 1005                 pbuf += BP_GET_PSIZE(gbp);
 1006         }
 1007 
 1008         if (zio_checksum_verify(bp, buf))
 1009                 return (EIO);
 1010         return (0);
 1011 }
 1012 
 1013 static int
 1014 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
 1015 {
 1016         int cpfunc = BP_GET_COMPRESS(bp);
 1017         uint64_t align, size;
 1018         void *pbuf;
 1019         int i, error;
 1020 
 1021         error = EIO;
 1022 
 1023         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
 1024                 const dva_t *dva = &bp->blk_dva[i];
 1025                 vdev_t *vdev;
 1026                 int vdevid;
 1027                 off_t offset;
 1028 
 1029                 if (!dva->dva_word[0] && !dva->dva_word[1])
 1030                         continue;
 1031 
 1032                 vdevid = DVA_GET_VDEV(dva);
 1033                 offset = DVA_GET_OFFSET(dva);
 1034                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
 1035                         if (vdev->v_id == vdevid)
 1036                                 break;
 1037                 }
 1038                 if (!vdev || !vdev->v_read)
 1039                         continue;
 1040 
 1041                 size = BP_GET_PSIZE(bp);
 1042                 if (vdev->v_read == vdev_raidz_read) {
 1043                         align = 1ULL << vdev->v_top->v_ashift;
 1044                         if (P2PHASE(size, align) != 0)
 1045                                 size = P2ROUNDUP(size, align);
 1046                 }
 1047                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
 1048                         pbuf = zfs_alloc(size);
 1049                 else
 1050                         pbuf = buf;
 1051 
 1052                 if (DVA_GET_GANG(dva))
 1053                         error = zio_read_gang(spa, bp, pbuf);
 1054                 else
 1055                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
 1056                 if (error == 0) {
 1057                         if (cpfunc != ZIO_COMPRESS_OFF)
 1058                                 error = zio_decompress_data(cpfunc, pbuf,
 1059                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
 1060                         else if (size != BP_GET_PSIZE(bp))
 1061                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
 1062                 }
 1063                 if (buf != pbuf)
 1064                         zfs_free(pbuf, size);
 1065                 if (error == 0)
 1066                         break;
 1067         }
 1068         if (error != 0)
 1069                 printf("ZFS: i/o error - all block copies unavailable\n");
 1070         return (error);
 1071 }
 1072 
 1073 static int
 1074 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
 1075 {
 1076         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
 1077         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1078         int nlevels = dnode->dn_nlevels;
 1079         int i, rc;
 1080 
 1081         /*
 1082          * Note: bsize may not be a power of two here so we need to do an
 1083          * actual divide rather than a bitshift.
 1084          */
 1085         while (buflen > 0) {
 1086                 uint64_t bn = offset / bsize;
 1087                 int boff = offset % bsize;
 1088                 int ibn;
 1089                 const blkptr_t *indbp;
 1090                 blkptr_t bp;
 1091 
 1092                 if (bn > dnode->dn_maxblkid)
 1093                         return (EIO);
 1094 
 1095                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
 1096                         goto cached;
 1097 
 1098                 indbp = dnode->dn_blkptr;
 1099                 for (i = 0; i < nlevels; i++) {
 1100                         /*
 1101                          * Copy the bp from the indirect array so that
 1102                          * we can re-use the scratch buffer for multi-level
 1103                          * objects.
 1104                          */
 1105                         ibn = bn >> ((nlevels - i - 1) * ibshift);
 1106                         ibn &= ((1 << ibshift) - 1);
 1107                         bp = indbp[ibn];
 1108                         rc = zio_read(spa, &bp, dnode_cache_buf);
 1109                         if (rc)
 1110                                 return (rc);
 1111                         indbp = (const blkptr_t *) dnode_cache_buf;
 1112                 }
 1113                 dnode_cache_obj = dnode;
 1114                 dnode_cache_bn = bn;
 1115         cached:
 1116 
 1117                 /*
 1118                  * The buffer contains our data block. Copy what we
 1119                  * need from it and loop.
 1120                  */ 
 1121                 i = bsize - boff;
 1122                 if (i > buflen) i = buflen;
 1123                 memcpy(buf, &dnode_cache_buf[boff], i);
 1124                 buf = ((char*) buf) + i;
 1125                 offset += i;
 1126                 buflen -= i;
 1127         }
 1128 
 1129         return (0);
 1130 }
 1131 
 1132 /*
 1133  * Lookup a value in a microzap directory. Assumes that the zap
 1134  * scratch buffer contains the directory contents.
 1135  */
 1136 static int
 1137 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1138 {
 1139         const mzap_phys_t *mz;
 1140         const mzap_ent_phys_t *mze;
 1141         size_t size;
 1142         int chunks, i;
 1143 
 1144         /*
 1145          * Microzap objects use exactly one block. Read the whole
 1146          * thing.
 1147          */
 1148         size = dnode->dn_datablkszsec * 512;
 1149 
 1150         mz = (const mzap_phys_t *) zap_scratch;
 1151         chunks = size / MZAP_ENT_LEN - 1;
 1152 
 1153         for (i = 0; i < chunks; i++) {
 1154                 mze = &mz->mz_chunk[i];
 1155                 if (!strcmp(mze->mze_name, name)) {
 1156                         *value = mze->mze_value;
 1157                         return (0);
 1158                 }
 1159         }
 1160 
 1161         return (ENOENT);
 1162 }
 1163 
 1164 /*
 1165  * Compare a name with a zap leaf entry. Return non-zero if the name
 1166  * matches.
 1167  */
 1168 static int
 1169 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
 1170 {
 1171         size_t namelen;
 1172         const zap_leaf_chunk_t *nc;
 1173         const char *p;
 1174 
 1175         namelen = zc->l_entry.le_name_length;
 1176                         
 1177         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
 1178         p = name;
 1179         while (namelen > 0) {
 1180                 size_t len;
 1181                 len = namelen;
 1182                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1183                         len = ZAP_LEAF_ARRAY_BYTES;
 1184                 if (memcmp(p, nc->l_array.la_array, len))
 1185                         return (0);
 1186                 p += len;
 1187                 namelen -= len;
 1188                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
 1189         }
 1190 
 1191         return 1;
 1192 }
 1193 
 1194 /*
 1195  * Extract a uint64_t value from a zap leaf entry.
 1196  */
 1197 static uint64_t
 1198 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
 1199 {
 1200         const zap_leaf_chunk_t *vc;
 1201         int i;
 1202         uint64_t value;
 1203         const uint8_t *p;
 1204 
 1205         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
 1206         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
 1207                 value = (value << 8) | p[i];
 1208         }
 1209 
 1210         return value;
 1211 }
 1212 
 1213 /*
 1214  * Lookup a value in a fatzap directory. Assumes that the zap scratch
 1215  * buffer contains the directory header.
 1216  */
 1217 static int
 1218 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1219 {
 1220         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1221         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1222         fat_zap_t z;
 1223         uint64_t *ptrtbl;
 1224         uint64_t hash;
 1225         int rc;
 1226 
 1227         if (zh.zap_magic != ZAP_MAGIC)
 1228                 return (EIO);
 1229 
 1230         z.zap_block_shift = ilog2(bsize);
 1231         z.zap_phys = (zap_phys_t *) zap_scratch;
 1232 
 1233         /*
 1234          * Figure out where the pointer table is and read it in if necessary.
 1235          */
 1236         if (zh.zap_ptrtbl.zt_blk) {
 1237                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
 1238                                zap_scratch, bsize);
 1239                 if (rc)
 1240                         return (rc);
 1241                 ptrtbl = (uint64_t *) zap_scratch;
 1242         } else {
 1243                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
 1244         }
 1245 
 1246         hash = zap_hash(zh.zap_salt, name);
 1247 
 1248         zap_leaf_t zl;
 1249         zl.l_bs = z.zap_block_shift;
 1250 
 1251         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
 1252         zap_leaf_chunk_t *zc;
 1253 
 1254         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
 1255         if (rc)
 1256                 return (rc);
 1257 
 1258         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1259 
 1260         /*
 1261          * Make sure this chunk matches our hash.
 1262          */
 1263         if (zl.l_phys->l_hdr.lh_prefix_len > 0
 1264             && zl.l_phys->l_hdr.lh_prefix
 1265             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
 1266                 return (ENOENT);
 1267 
 1268         /*
 1269          * Hash within the chunk to find our entry.
 1270          */
 1271         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
 1272         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
 1273         h = zl.l_phys->l_hash[h];
 1274         if (h == 0xffff)
 1275                 return (ENOENT);
 1276         zc = &ZAP_LEAF_CHUNK(&zl, h);
 1277         while (zc->l_entry.le_hash != hash) {
 1278                 if (zc->l_entry.le_next == 0xffff) {
 1279                         zc = 0;
 1280                         break;
 1281                 }
 1282                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
 1283         }
 1284         if (fzap_name_equal(&zl, zc, name)) {
 1285                 *value = fzap_leaf_value(&zl, zc);
 1286                 return (0);
 1287         }
 1288 
 1289         return (ENOENT);
 1290 }
 1291 
 1292 /*
 1293  * Lookup a name in a zap object and return its value as a uint64_t.
 1294  */
 1295 static int
 1296 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1297 {
 1298         int rc;
 1299         uint64_t zap_type;
 1300         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1301 
 1302         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
 1303         if (rc)
 1304                 return (rc);
 1305 
 1306         zap_type = *(uint64_t *) zap_scratch;
 1307         if (zap_type == ZBT_MICRO)
 1308                 return mzap_lookup(spa, dnode, name, value);
 1309         else if (zap_type == ZBT_HEADER)
 1310                 return fzap_lookup(spa, dnode, name, value);
 1311         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
 1312         return (EIO);
 1313 }
 1314 
 1315 #ifdef BOOT2
 1316 
 1317 /*
 1318  * List a microzap directory. Assumes that the zap scratch buffer contains
 1319  * the directory contents.
 1320  */
 1321 static int
 1322 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1323 {
 1324         const mzap_phys_t *mz;
 1325         const mzap_ent_phys_t *mze;
 1326         size_t size;
 1327         int chunks, i;
 1328 
 1329         /*
 1330          * Microzap objects use exactly one block. Read the whole
 1331          * thing.
 1332          */
 1333         size = dnode->dn_datablkszsec * 512;
 1334         mz = (const mzap_phys_t *) zap_scratch;
 1335         chunks = size / MZAP_ENT_LEN - 1;
 1336 
 1337         for (i = 0; i < chunks; i++) {
 1338                 mze = &mz->mz_chunk[i];
 1339                 if (mze->mze_name[0])
 1340                         //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
 1341                         printf("%s\n", mze->mze_name);
 1342         }
 1343 
 1344         return (0);
 1345 }
 1346 
 1347 /*
 1348  * List a fatzap directory. Assumes that the zap scratch buffer contains
 1349  * the directory header.
 1350  */
 1351 static int
 1352 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1353 {
 1354         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1355         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1356         fat_zap_t z;
 1357         int i, j;
 1358 
 1359         if (zh.zap_magic != ZAP_MAGIC)
 1360                 return (EIO);
 1361 
 1362         z.zap_block_shift = ilog2(bsize);
 1363         z.zap_phys = (zap_phys_t *) zap_scratch;
 1364 
 1365         /*
 1366          * This assumes that the leaf blocks start at block 1. The
 1367          * documentation isn't exactly clear on this.
 1368          */
 1369         zap_leaf_t zl;
 1370         zl.l_bs = z.zap_block_shift;
 1371         for (i = 0; i < zh.zap_num_leafs; i++) {
 1372                 off_t off = (i + 1) << zl.l_bs;
 1373                 char name[256], *p;
 1374                 uint64_t value;
 1375 
 1376                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
 1377                         return (EIO);
 1378 
 1379                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1380 
 1381                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
 1382                         zap_leaf_chunk_t *zc, *nc;
 1383                         int namelen;
 1384 
 1385                         zc = &ZAP_LEAF_CHUNK(&zl, j);
 1386                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
 1387                                 continue;
 1388                         namelen = zc->l_entry.le_name_length;
 1389                         if (namelen > sizeof(name))
 1390                                 namelen = sizeof(name);
 1391                         
 1392                         /*
 1393                          * Paste the name back together.
 1394                          */
 1395                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
 1396                         p = name;
 1397                         while (namelen > 0) {
 1398                                 int len;
 1399                                 len = namelen;
 1400                                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1401                                         len = ZAP_LEAF_ARRAY_BYTES;
 1402                                 memcpy(p, nc->l_array.la_array, len);
 1403                                 p += len;
 1404                                 namelen -= len;
 1405                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
 1406                         }
 1407 
 1408                         /*
 1409                          * Assume the first eight bytes of the value are
 1410                          * a uint64_t.
 1411                          */
 1412                         value = fzap_leaf_value(&zl, zc);
 1413 
 1414                         printf("%s 0x%llx\n", name, value);
 1415                 }
 1416         }
 1417 
 1418         return (0);
 1419 }
 1420 
 1421 /*
 1422  * List a zap directory.
 1423  */
 1424 static int
 1425 zap_list(spa_t *spa, const dnode_phys_t *dnode)
 1426 {
 1427         uint64_t zap_type;
 1428         size_t size = dnode->dn_datablkszsec * 512;
 1429 
 1430         if (dnode_read(spa, dnode, 0, zap_scratch, size))
 1431                 return (EIO);
 1432 
 1433         zap_type = *(uint64_t *) zap_scratch;
 1434         if (zap_type == ZBT_MICRO)
 1435                 return mzap_list(spa, dnode);
 1436         else
 1437                 return fzap_list(spa, dnode);
 1438 }
 1439 
 1440 #endif
 1441 
 1442 static int
 1443 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
 1444 {
 1445         off_t offset;
 1446 
 1447         offset = objnum * sizeof(dnode_phys_t);
 1448         return dnode_read(spa, &os->os_meta_dnode, offset,
 1449                 dnode, sizeof(dnode_phys_t));
 1450 }
 1451 
 1452 /*
 1453  * Find the object set given the object number of its dataset object
 1454  * and return its details in *objset
 1455  */
 1456 static int
 1457 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
 1458 {
 1459         dnode_phys_t dataset;
 1460         dsl_dataset_phys_t *ds;
 1461 
 1462         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
 1463                 printf("ZFS: can't find dataset %llu\n", objnum);
 1464                 return (EIO);
 1465         }
 1466 
 1467         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
 1468         if (zio_read(spa, &ds->ds_bp, objset)) {
 1469                 printf("ZFS: can't read object set for dataset %llu\n", objnum);
 1470                 return (EIO);
 1471         }
 1472 
 1473         return (0);
 1474 }
 1475 
 1476 /*
 1477  * Find the object set pointed to by the BOOTFS property or the root
 1478  * dataset if there is none and return its details in *objset
 1479  */
 1480 static int
 1481 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
 1482 {
 1483         dnode_phys_t dir, propdir;
 1484         uint64_t props, bootfs, root;
 1485 
 1486         /*
 1487          * Start with the MOS directory object.
 1488          */
 1489         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
 1490                 printf("ZFS: can't read MOS object directory\n");
 1491                 return (EIO);
 1492         }
 1493 
 1494         /*
 1495          * Lookup the pool_props and see if we can find a bootfs.
 1496          */
 1497         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
 1498              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
 1499              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
 1500              && bootfs != 0)
 1501                 return zfs_mount_dataset(spa, bootfs, objset);
 1502 
 1503         /*
 1504          * Lookup the root dataset directory
 1505          */
 1506         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
 1507             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
 1508                 printf("ZFS: can't find root dsl_dir\n");
 1509                 return (EIO);
 1510         }
 1511 
 1512         /*
 1513          * Use the information from the dataset directory's bonus buffer
 1514          * to find the dataset object and from that the object set itself.
 1515          */
 1516         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
 1517         return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
 1518 }
 1519 
 1520 static int
 1521 zfs_mount_pool(spa_t *spa)
 1522 {
 1523 
 1524         /*
 1525          * Find the MOS and work our way in from there.
 1526          */
 1527         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
 1528                 printf("ZFS: can't read MOS\n");
 1529                 return (EIO);
 1530         }
 1531 
 1532         /*
 1533          * Find the root object set
 1534          */
 1535         if (zfs_mount_root(spa, &spa->spa_root_objset)) {
 1536                 printf("Can't find root filesystem - giving up\n");
 1537                 return (EIO);
 1538         }
 1539 
 1540         return (0);
 1541 }
 1542 
 1543 static int
 1544 zfs_dnode_stat(spa_t *spa, dnode_phys_t *dn, struct stat *sb)
 1545 {
 1546 
 1547         if (dn->dn_bonustype != DMU_OT_SA) {
 1548                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
 1549 
 1550                 sb->st_mode = zp->zp_mode;
 1551                 sb->st_uid = zp->zp_uid;
 1552                 sb->st_gid = zp->zp_gid;
 1553                 sb->st_size = zp->zp_size;
 1554         } else {
 1555                 sa_hdr_phys_t *sahdrp;
 1556                 int hdrsize;
 1557                 size_t size = 0;
 1558                 void *buf = NULL;
 1559 
 1560                 if (dn->dn_bonuslen != 0)
 1561                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
 1562                 else {
 1563                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
 1564                                 blkptr_t *bp = &dn->dn_spill;
 1565                                 int error;
 1566 
 1567                                 size = BP_GET_LSIZE(bp);
 1568                                 buf = zfs_alloc(size);
 1569                                 error = zio_read(spa, bp, buf);
 1570                                 if (error != 0) {
 1571                                         zfs_free(buf, size);
 1572                                         return (error);
 1573                                 }
 1574                                 sahdrp = buf;
 1575                         } else {
 1576                                 return (EIO);
 1577                         }
 1578                 }
 1579                 hdrsize = SA_HDR_SIZE(sahdrp);
 1580                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
 1581                     SA_MODE_OFFSET);
 1582                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
 1583                     SA_UID_OFFSET);
 1584                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
 1585                     SA_GID_OFFSET);
 1586                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
 1587                     SA_SIZE_OFFSET);
 1588                 if (buf != NULL)
 1589                         zfs_free(buf, size);
 1590         }
 1591 
 1592         return (0);
 1593 }
 1594 
 1595 /*
 1596  * Lookup a file and return its dnode.
 1597  */
 1598 static int
 1599 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
 1600 {
 1601         int rc;
 1602         uint64_t objnum, rootnum, parentnum;
 1603         dnode_phys_t dn;
 1604         const char *p, *q;
 1605         char element[256];
 1606         char path[1024];
 1607         int symlinks_followed = 0;
 1608         struct stat sb;
 1609 
 1610         if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
 1611                 printf("ZFS: unexpected object set type %llu\n",
 1612                        spa->spa_root_objset.os_type);
 1613                 return (EIO);
 1614         }
 1615 
 1616         /*
 1617          * Get the root directory dnode.
 1618          */
 1619         rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
 1620         if (rc)
 1621                 return (rc);
 1622 
 1623         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
 1624         if (rc)
 1625                 return (rc);
 1626 
 1627         rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
 1628         if (rc)
 1629                 return (rc);
 1630 
 1631         objnum = rootnum;
 1632         p = upath;
 1633         while (p && *p) {
 1634                 while (*p == '/')
 1635                         p++;
 1636                 if (!*p)
 1637                         break;
 1638                 q = strchr(p, '/');
 1639                 if (q) {
 1640                         memcpy(element, p, q - p);
 1641                         element[q - p] = 0;
 1642                         p = q;
 1643                 } else {
 1644                         strcpy(element, p);
 1645                         p = 0;
 1646                 }
 1647 
 1648                 rc = zfs_dnode_stat(spa, &dn, &sb);
 1649                 if (rc)
 1650                         return (rc);
 1651                 if (!S_ISDIR(sb.st_mode))
 1652                         return (ENOTDIR);
 1653 
 1654                 parentnum = objnum;
 1655                 rc = zap_lookup(spa, &dn, element, &objnum);
 1656                 if (rc)
 1657                         return (rc);
 1658                 objnum = ZFS_DIRENT_OBJ(objnum);
 1659 
 1660                 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1661                 if (rc)
 1662                         return (rc);
 1663 
 1664                 /*
 1665                  * Check for symlink.
 1666                  */
 1667                 rc = zfs_dnode_stat(spa, &dn, &sb);
 1668                 if (rc)
 1669                         return (rc);
 1670                 if (S_ISLNK(sb.st_mode)) {
 1671                         if (symlinks_followed > 10)
 1672                                 return (EMLINK);
 1673                         symlinks_followed++;
 1674 
 1675                         /*
 1676                          * Read the link value and copy the tail of our
 1677                          * current path onto the end.
 1678                          */
 1679                         if (p)
 1680                                 strcpy(&path[sb.st_size], p);
 1681                         else
 1682                                 path[sb.st_size] = 0;
 1683                         if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
 1684                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
 1685                                         sb.st_size);
 1686                         } else {
 1687                                 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
 1688                                 if (rc)
 1689                                         return (rc);
 1690                         }
 1691 
 1692                         /*
 1693                          * Restart with the new path, starting either at
 1694                          * the root or at the parent depending whether or
 1695                          * not the link is relative.
 1696                          */
 1697                         p = path;
 1698                         if (*p == '/')
 1699                                 objnum = rootnum;
 1700                         else
 1701                                 objnum = parentnum;
 1702                         objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1703                 }
 1704         }
 1705 
 1706         *dnode = dn;
 1707         return (0);
 1708 }

Cache object: 409a8054329b76d678c16cead5b2ebef


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.