The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c

Version: -  FREEBSD  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-2  -  FREEBSD-11-1  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-4  -  FREEBSD-10-3  -  FREEBSD-10-2  -  FREEBSD-10-1  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-3  -  FREEBSD-9-2  -  FREEBSD-9-1  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-4  -  FREEBSD-8-3  -  FREEBSD-8-2  -  FREEBSD-8-1  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-4  -  FREEBSD-7-3  -  FREEBSD-7-2  -  FREEBSD-7-1  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-4  -  FREEBSD-6-3  -  FREEBSD-6-2  -  FREEBSD-6-1  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-5  -  FREEBSD-5-4  -  FREEBSD-5-3  -  FREEBSD-5-2  -  FREEBSD-5-1  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  xnu-1456.1.26  -  xnu-1699.24.8  -  xnu-2050.18.24  -  OPENSOLARIS  -  minix-3-1-1 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2007 Doug Rabson
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/8.3/sys/boot/zfs/zfsimpl.c 230164 2012-01-15 21:50:17Z avg $");
   29 
   30 /*
   31  *      Stand-alone ZFS file reader.
   32  */
   33 
   34 #include <sys/stat.h>
   35 
   36 #include "zfsimpl.h"
   37 #include "zfssubr.c"
   38 
   39 /*
   40  * List of all vdevs, chained through v_alllink.
   41  */
   42 static vdev_list_t zfs_vdevs;
   43 
   44 /*
   45  * List of all pools, chained through spa_link.
   46  */
   47 static spa_list_t zfs_pools;
   48 
   49 static uint64_t zfs_crc64_table[256];
   50 static const dnode_phys_t *dnode_cache_obj = 0;
   51 static uint64_t dnode_cache_bn;
   52 static char *dnode_cache_buf;
   53 static char *zap_scratch;
   54 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
   55 
   56 #define TEMP_SIZE       (1024 * 1024)
   57 
   58 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
   59 
   60 static void
   61 zfs_init(void)
   62 {
   63         STAILQ_INIT(&zfs_vdevs);
   64         STAILQ_INIT(&zfs_pools);
   65 
   66         zfs_temp_buf = malloc(TEMP_SIZE);
   67         zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
   68         zfs_temp_ptr = zfs_temp_buf;
   69         dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
   70         zap_scratch = malloc(SPA_MAXBLOCKSIZE);
   71 
   72         zfs_init_crc();
   73 }
   74 
   75 static void *
   76 zfs_alloc(size_t size)
   77 {
   78         char *ptr;
   79 
   80         if (zfs_temp_ptr + size > zfs_temp_end) {
   81                 printf("ZFS: out of temporary buffer space\n");
   82                 for (;;) ;
   83         }
   84         ptr = zfs_temp_ptr;
   85         zfs_temp_ptr += size;
   86 
   87         return (ptr);
   88 }
   89 
   90 static void
   91 zfs_free(void *ptr, size_t size)
   92 {
   93 
   94         zfs_temp_ptr -= size;
   95         if (zfs_temp_ptr != ptr) {
   96                 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
   97                 for (;;) ;
   98         }
   99 }
  100 
  101 static int
  102 xdr_int(const unsigned char **xdr, int *ip)
  103 {
  104         *ip = ((*xdr)[0] << 24)
  105                 | ((*xdr)[1] << 16)
  106                 | ((*xdr)[2] << 8)
  107                 | ((*xdr)[3] << 0);
  108         (*xdr) += 4;
  109         return (0);
  110 }
  111 
  112 static int
  113 xdr_u_int(const unsigned char **xdr, u_int *ip)
  114 {
  115         *ip = ((*xdr)[0] << 24)
  116                 | ((*xdr)[1] << 16)
  117                 | ((*xdr)[2] << 8)
  118                 | ((*xdr)[3] << 0);
  119         (*xdr) += 4;
  120         return (0);
  121 }
  122 
  123 static int
  124 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
  125 {
  126         u_int hi, lo;
  127 
  128         xdr_u_int(xdr, &hi);
  129         xdr_u_int(xdr, &lo);
  130         *lp = (((uint64_t) hi) << 32) | lo;
  131         return (0);
  132 }
  133 
  134 static int
  135 nvlist_find(const unsigned char *nvlist, const char *name, int type,
  136             int* elementsp, void *valuep)
  137 {
  138         const unsigned char *p, *pair;
  139         int junk;
  140         int encoded_size, decoded_size;
  141 
  142         p = nvlist;
  143         xdr_int(&p, &junk);
  144         xdr_int(&p, &junk);
  145 
  146         pair = p;
  147         xdr_int(&p, &encoded_size);
  148         xdr_int(&p, &decoded_size);
  149         while (encoded_size && decoded_size) {
  150                 int namelen, pairtype, elements;
  151                 const char *pairname;
  152 
  153                 xdr_int(&p, &namelen);
  154                 pairname = (const char*) p;
  155                 p += roundup(namelen, 4);
  156                 xdr_int(&p, &pairtype);
  157 
  158                 if (!memcmp(name, pairname, namelen) && type == pairtype) {
  159                         xdr_int(&p, &elements);
  160                         if (elementsp)
  161                                 *elementsp = elements;
  162                         if (type == DATA_TYPE_UINT64) {
  163                                 xdr_uint64_t(&p, (uint64_t *) valuep);
  164                                 return (0);
  165                         } else if (type == DATA_TYPE_STRING) {
  166                                 int len;
  167                                 xdr_int(&p, &len);
  168                                 (*(const char**) valuep) = (const char*) p;
  169                                 return (0);
  170                         } else if (type == DATA_TYPE_NVLIST
  171                                    || type == DATA_TYPE_NVLIST_ARRAY) {
  172                                 (*(const unsigned char**) valuep) =
  173                                          (const unsigned char*) p;
  174                                 return (0);
  175                         } else {
  176                                 return (EIO);
  177                         }
  178                 } else {
  179                         /*
  180                          * Not the pair we are looking for, skip to the next one.
  181                          */
  182                         p = pair + encoded_size;
  183                 }
  184 
  185                 pair = p;
  186                 xdr_int(&p, &encoded_size);
  187                 xdr_int(&p, &decoded_size);
  188         }
  189 
  190         return (EIO);
  191 }
  192 
  193 /*
  194  * Return the next nvlist in an nvlist array.
  195  */
  196 static const unsigned char *
  197 nvlist_next(const unsigned char *nvlist)
  198 {
  199         const unsigned char *p, *pair;
  200         int junk;
  201         int encoded_size, decoded_size;
  202 
  203         p = nvlist;
  204         xdr_int(&p, &junk);
  205         xdr_int(&p, &junk);
  206 
  207         pair = p;
  208         xdr_int(&p, &encoded_size);
  209         xdr_int(&p, &decoded_size);
  210         while (encoded_size && decoded_size) {
  211                 p = pair + encoded_size;
  212 
  213                 pair = p;
  214                 xdr_int(&p, &encoded_size);
  215                 xdr_int(&p, &decoded_size);
  216         }
  217 
  218         return p;
  219 }
  220 
  221 #ifdef TEST
  222 
  223 static const unsigned char *
  224 nvlist_print(const unsigned char *nvlist, unsigned int indent)
  225 {
  226         static const char* typenames[] = {
  227                 "DATA_TYPE_UNKNOWN",
  228                 "DATA_TYPE_BOOLEAN",
  229                 "DATA_TYPE_BYTE",
  230                 "DATA_TYPE_INT16",
  231                 "DATA_TYPE_UINT16",
  232                 "DATA_TYPE_INT32",
  233                 "DATA_TYPE_UINT32",
  234                 "DATA_TYPE_INT64",
  235                 "DATA_TYPE_UINT64",
  236                 "DATA_TYPE_STRING",
  237                 "DATA_TYPE_BYTE_ARRAY",
  238                 "DATA_TYPE_INT16_ARRAY",
  239                 "DATA_TYPE_UINT16_ARRAY",
  240                 "DATA_TYPE_INT32_ARRAY",
  241                 "DATA_TYPE_UINT32_ARRAY",
  242                 "DATA_TYPE_INT64_ARRAY",
  243                 "DATA_TYPE_UINT64_ARRAY",
  244                 "DATA_TYPE_STRING_ARRAY",
  245                 "DATA_TYPE_HRTIME",
  246                 "DATA_TYPE_NVLIST",
  247                 "DATA_TYPE_NVLIST_ARRAY",
  248                 "DATA_TYPE_BOOLEAN_VALUE",
  249                 "DATA_TYPE_INT8",
  250                 "DATA_TYPE_UINT8",
  251                 "DATA_TYPE_BOOLEAN_ARRAY",
  252                 "DATA_TYPE_INT8_ARRAY",
  253                 "DATA_TYPE_UINT8_ARRAY"
  254         };
  255 
  256         unsigned int i, j;
  257         const unsigned char *p, *pair;
  258         int junk;
  259         int encoded_size, decoded_size;
  260 
  261         p = nvlist;
  262         xdr_int(&p, &junk);
  263         xdr_int(&p, &junk);
  264 
  265         pair = p;
  266         xdr_int(&p, &encoded_size);
  267         xdr_int(&p, &decoded_size);
  268         while (encoded_size && decoded_size) {
  269                 int namelen, pairtype, elements;
  270                 const char *pairname;
  271 
  272                 xdr_int(&p, &namelen);
  273                 pairname = (const char*) p;
  274                 p += roundup(namelen, 4);
  275                 xdr_int(&p, &pairtype);
  276 
  277                 for (i = 0; i < indent; i++)
  278                         printf(" ");
  279                 printf("%s %s", typenames[pairtype], pairname);
  280 
  281                 xdr_int(&p, &elements);
  282                 switch (pairtype) {
  283                 case DATA_TYPE_UINT64: {
  284                         uint64_t val;
  285                         xdr_uint64_t(&p, &val);
  286                         printf(" = 0x%llx\n", val);
  287                         break;
  288                 }
  289 
  290                 case DATA_TYPE_STRING: {
  291                         int len;
  292                         xdr_int(&p, &len);
  293                         printf(" = \"%s\"\n", p);
  294                         break;
  295                 }
  296 
  297                 case DATA_TYPE_NVLIST:
  298                         printf("\n");
  299                         nvlist_print(p, indent + 1);
  300                         break;
  301 
  302                 case DATA_TYPE_NVLIST_ARRAY:
  303                         for (j = 0; j < elements; j++) {
  304                                 printf("[%d]\n", j);
  305                                 p = nvlist_print(p, indent + 1);
  306                                 if (j != elements - 1) {
  307                                         for (i = 0; i < indent; i++)
  308                                                 printf(" ");
  309                                         printf("%s %s", typenames[pairtype], pairname);
  310                                 }
  311                         }
  312                         break;
  313 
  314                 default:
  315                         printf("\n");
  316                 }
  317 
  318                 p = pair + encoded_size;
  319 
  320                 pair = p;
  321                 xdr_int(&p, &encoded_size);
  322                 xdr_int(&p, &decoded_size);
  323         }
  324 
  325         return p;
  326 }
  327 
  328 #endif
  329 
  330 static int
  331 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
  332     off_t offset, size_t size)
  333 {
  334         size_t psize;
  335         int rc;
  336 
  337         if (!vdev->v_phys_read)
  338                 return (EIO);
  339 
  340         if (bp) {
  341                 psize = BP_GET_PSIZE(bp);
  342         } else {
  343                 psize = size;
  344         }
  345 
  346         /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
  347         rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
  348         if (rc)
  349                 return (rc);
  350         if (bp && zio_checksum_verify(bp, buf))
  351                 return (EIO);
  352 
  353         return (0);
  354 }
  355 
  356 static int
  357 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  358     off_t offset, size_t bytes)
  359 {
  360 
  361         return (vdev_read_phys(vdev, bp, buf,
  362                 offset + VDEV_LABEL_START_SIZE, bytes));
  363 }
  364 
  365 
  366 static int
  367 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  368     off_t offset, size_t bytes)
  369 {
  370         vdev_t *kid;
  371         int rc;
  372 
  373         rc = EIO;
  374         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  375                 if (kid->v_state != VDEV_STATE_HEALTHY)
  376                         continue;
  377                 rc = kid->v_read(kid, bp, buf, offset, bytes);
  378                 if (!rc)
  379                         return (0);
  380         }
  381 
  382         return (rc);
  383 }
  384 
  385 static int
  386 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
  387     off_t offset, size_t bytes)
  388 {
  389         vdev_t *kid;
  390 
  391         /*
  392          * Here we should have two kids:
  393          * First one which is the one we are replacing and we can trust
  394          * only this one to have valid data, but it might not be present.
  395          * Second one is that one we are replacing with. It is most likely
  396          * healthy, but we can't trust it has needed data, so we won't use it.
  397          */
  398         kid = STAILQ_FIRST(&vdev->v_children);
  399         if (kid == NULL)
  400                 return (EIO);
  401         if (kid->v_state != VDEV_STATE_HEALTHY)
  402                 return (EIO);
  403         return (kid->v_read(kid, bp, buf, offset, bytes));
  404 }
  405 
  406 static vdev_t *
  407 vdev_find(uint64_t guid)
  408 {
  409         vdev_t *vdev;
  410 
  411         STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
  412                 if (vdev->v_guid == guid)
  413                         return (vdev);
  414 
  415         return (0);
  416 }
  417 
  418 static vdev_t *
  419 vdev_create(uint64_t guid, vdev_read_t *read)
  420 {
  421         vdev_t *vdev;
  422 
  423         vdev = malloc(sizeof(vdev_t));
  424         memset(vdev, 0, sizeof(vdev_t));
  425         STAILQ_INIT(&vdev->v_children);
  426         vdev->v_guid = guid;
  427         vdev->v_state = VDEV_STATE_OFFLINE;
  428         vdev->v_read = read;
  429         vdev->v_phys_read = 0;
  430         vdev->v_read_priv = 0;
  431         STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
  432 
  433         return (vdev);
  434 }
  435 
  436 static int
  437 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
  438     vdev_t **vdevp, int is_newer)
  439 {
  440         int rc;
  441         uint64_t guid, id, ashift, nparity;
  442         const char *type;
  443         const char *path;
  444         vdev_t *vdev, *kid;
  445         const unsigned char *kids;
  446         int nkids, i, is_new;
  447         uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
  448 
  449         if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
  450                         DATA_TYPE_UINT64, 0, &guid)
  451             || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
  452                            DATA_TYPE_UINT64, 0, &id)
  453             || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
  454                            DATA_TYPE_STRING, 0, &type)) {
  455                 printf("ZFS: can't find vdev details\n");
  456                 return (ENOENT);
  457         }
  458 
  459         if (strcmp(type, VDEV_TYPE_MIRROR)
  460             && strcmp(type, VDEV_TYPE_DISK)
  461 #ifdef ZFS_TEST
  462             && strcmp(type, VDEV_TYPE_FILE)
  463 #endif
  464             && strcmp(type, VDEV_TYPE_RAIDZ)
  465             && strcmp(type, VDEV_TYPE_REPLACING)) {
  466                 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
  467                 return (EIO);
  468         }
  469 
  470         is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
  471 
  472         nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
  473                         &is_offline);
  474         nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
  475                         &is_removed);
  476         nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
  477                         &is_faulted);
  478         nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
  479                         &is_degraded);
  480         nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
  481                         &isnt_present);
  482 
  483         vdev = vdev_find(guid);
  484         if (!vdev) {
  485                 is_new = 1;
  486 
  487                 if (!strcmp(type, VDEV_TYPE_MIRROR))
  488                         vdev = vdev_create(guid, vdev_mirror_read);
  489                 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
  490                         vdev = vdev_create(guid, vdev_raidz_read);
  491                 else if (!strcmp(type, VDEV_TYPE_REPLACING))
  492                         vdev = vdev_create(guid, vdev_replacing_read);
  493                 else
  494                         vdev = vdev_create(guid, vdev_disk_read);
  495 
  496                 vdev->v_id = id;
  497                 vdev->v_top = pvdev != NULL ? pvdev : vdev;
  498                 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
  499                         DATA_TYPE_UINT64, 0, &ashift) == 0)
  500                         vdev->v_ashift = ashift;
  501                 else
  502                         vdev->v_ashift = 0;
  503                 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
  504                         DATA_TYPE_UINT64, 0, &nparity) == 0)
  505                         vdev->v_nparity = nparity;
  506                 else
  507                         vdev->v_nparity = 0;
  508                 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
  509                                 DATA_TYPE_STRING, 0, &path) == 0) {
  510                         if (strncmp(path, "/dev/", 5) == 0)
  511                                 path += 5;
  512                         vdev->v_name = strdup(path);
  513                 } else {
  514                         if (!strcmp(type, "raidz")) {
  515                                 if (vdev->v_nparity == 1)
  516                                         vdev->v_name = "raidz1";
  517                                 else if (vdev->v_nparity == 2)
  518                                         vdev->v_name = "raidz2";
  519                                 else if (vdev->v_nparity == 3)
  520                                         vdev->v_name = "raidz3";
  521                                 else {
  522                                         printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
  523                                         return (EIO);
  524                                 }
  525                         } else {
  526                                 vdev->v_name = strdup(type);
  527                         }
  528                 }
  529         } else {
  530                 is_new = 0;
  531         }
  532 
  533         if (is_new || is_newer) {
  534                 /*
  535                  * This is either new vdev or we've already seen this vdev,
  536                  * but from an older vdev label, so let's refresh its state
  537                  * from the newer label.
  538                  */
  539                 if (is_offline)
  540                         vdev->v_state = VDEV_STATE_OFFLINE;
  541                 else if (is_removed)
  542                         vdev->v_state = VDEV_STATE_REMOVED;
  543                 else if (is_faulted)
  544                         vdev->v_state = VDEV_STATE_FAULTED;
  545                 else if (is_degraded)
  546                         vdev->v_state = VDEV_STATE_DEGRADED;
  547                 else if (isnt_present)
  548                         vdev->v_state = VDEV_STATE_CANT_OPEN;
  549         }
  550 
  551         rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
  552                          DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
  553         /*
  554          * Its ok if we don't have any kids.
  555          */
  556         if (rc == 0) {
  557                 vdev->v_nchildren = nkids;
  558                 for (i = 0; i < nkids; i++) {
  559                         rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
  560                         if (rc)
  561                                 return (rc);
  562                         if (is_new)
  563                                 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
  564                                                    v_childlink);
  565                         kids = nvlist_next(kids);
  566                 }
  567         } else {
  568                 vdev->v_nchildren = 0;
  569         }
  570 
  571         if (vdevp)
  572                 *vdevp = vdev;
  573         return (0);
  574 }
  575 
  576 static void
  577 vdev_set_state(vdev_t *vdev)
  578 {
  579         vdev_t *kid;
  580         int good_kids;
  581         int bad_kids;
  582 
  583         /*
  584          * A mirror or raidz is healthy if all its kids are healthy. A
  585          * mirror is degraded if any of its kids is healthy; a raidz
  586          * is degraded if at most nparity kids are offline.
  587          */
  588         if (STAILQ_FIRST(&vdev->v_children)) {
  589                 good_kids = 0;
  590                 bad_kids = 0;
  591                 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  592                         if (kid->v_state == VDEV_STATE_HEALTHY)
  593                                 good_kids++;
  594                         else
  595                                 bad_kids++;
  596                 }
  597                 if (bad_kids == 0) {
  598                         vdev->v_state = VDEV_STATE_HEALTHY;
  599                 } else {
  600                         if (vdev->v_read == vdev_mirror_read) {
  601                                 if (good_kids) {
  602                                         vdev->v_state = VDEV_STATE_DEGRADED;
  603                                 } else {
  604                                         vdev->v_state = VDEV_STATE_OFFLINE;
  605                                 }
  606                         } else if (vdev->v_read == vdev_raidz_read) {
  607                                 if (bad_kids > vdev->v_nparity) {
  608                                         vdev->v_state = VDEV_STATE_OFFLINE;
  609                                 } else {
  610                                         vdev->v_state = VDEV_STATE_DEGRADED;
  611                                 }
  612                         }
  613                 }
  614         }
  615 }
  616 
  617 static spa_t *
  618 spa_find_by_guid(uint64_t guid)
  619 {
  620         spa_t *spa;
  621 
  622         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  623                 if (spa->spa_guid == guid)
  624                         return (spa);
  625 
  626         return (0);
  627 }
  628 
  629 #ifdef BOOT2
  630 
  631 static spa_t *
  632 spa_find_by_name(const char *name)
  633 {
  634         spa_t *spa;
  635 
  636         STAILQ_FOREACH(spa, &zfs_pools, spa_link)
  637                 if (!strcmp(spa->spa_name, name))
  638                         return (spa);
  639 
  640         return (0);
  641 }
  642 
  643 #endif
  644 
  645 static spa_t *
  646 spa_create(uint64_t guid)
  647 {
  648         spa_t *spa;
  649 
  650         spa = malloc(sizeof(spa_t));
  651         memset(spa, 0, sizeof(spa_t));
  652         STAILQ_INIT(&spa->spa_vdevs);
  653         spa->spa_guid = guid;
  654         STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
  655 
  656         return (spa);
  657 }
  658 
  659 static const char *
  660 state_name(vdev_state_t state)
  661 {
  662         static const char* names[] = {
  663                 "UNKNOWN",
  664                 "CLOSED",
  665                 "OFFLINE",
  666                 "REMOVED",
  667                 "CANT_OPEN",
  668                 "FAULTED",
  669                 "DEGRADED",
  670                 "ONLINE"
  671         };
  672         return names[state];
  673 }
  674 
  675 #ifdef BOOT2
  676 
  677 #define pager_printf printf
  678 
  679 #else
  680 
  681 static void
  682 pager_printf(const char *fmt, ...)
  683 {
  684         char line[80];
  685         va_list args;
  686 
  687         va_start(args, fmt);
  688         vsprintf(line, fmt, args);
  689         va_end(args);
  690         pager_output(line);
  691 }
  692 
  693 #endif
  694 
  695 #define STATUS_FORMAT   "        %s %s\n"
  696 
  697 static void
  698 print_state(int indent, const char *name, vdev_state_t state)
  699 {
  700         int i;
  701         char buf[512];
  702 
  703         buf[0] = 0;
  704         for (i = 0; i < indent; i++)
  705                 strcat(buf, "  ");
  706         strcat(buf, name);
  707         pager_printf(STATUS_FORMAT, buf, state_name(state));
  708         
  709 }
  710 
  711 static void
  712 vdev_status(vdev_t *vdev, int indent)
  713 {
  714         vdev_t *kid;
  715         print_state(indent, vdev->v_name, vdev->v_state);
  716 
  717         STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
  718                 vdev_status(kid, indent + 1);
  719         }
  720 }
  721 
  722 static void
  723 spa_status(spa_t *spa)
  724 {
  725         vdev_t *vdev;
  726         int good_kids, bad_kids, degraded_kids;
  727         vdev_state_t state;
  728 
  729         pager_printf("  pool: %s\n", spa->spa_name);
  730         pager_printf("config:\n\n");
  731         pager_printf(STATUS_FORMAT, "NAME", "STATE");
  732 
  733         good_kids = 0;
  734         degraded_kids = 0;
  735         bad_kids = 0;
  736         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  737                 if (vdev->v_state == VDEV_STATE_HEALTHY)
  738                         good_kids++;
  739                 else if (vdev->v_state == VDEV_STATE_DEGRADED)
  740                         degraded_kids++;
  741                 else
  742                         bad_kids++;
  743         }
  744 
  745         state = VDEV_STATE_CLOSED;
  746         if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
  747                 state = VDEV_STATE_HEALTHY;
  748         else if ((good_kids + degraded_kids) > 0)
  749                 state = VDEV_STATE_DEGRADED;
  750 
  751         print_state(0, spa->spa_name, state);
  752         STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
  753                 vdev_status(vdev, 1);
  754         }
  755 }
  756 
  757 static void
  758 spa_all_status(void)
  759 {
  760         spa_t *spa;
  761         int first = 1;
  762 
  763         STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
  764                 if (!first)
  765                         pager_printf("\n");
  766                 first = 0;
  767                 spa_status(spa);
  768         }
  769 }
  770 
  771 static int
  772 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
  773 {
  774         vdev_t vtmp;
  775         vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
  776         spa_t *spa;
  777         vdev_t *vdev, *top_vdev, *pool_vdev;
  778         off_t off;
  779         blkptr_t bp;
  780         const unsigned char *nvlist;
  781         uint64_t val;
  782         uint64_t guid;
  783         uint64_t pool_txg, pool_guid;
  784         uint64_t is_log;
  785         const char *pool_name;
  786         const unsigned char *vdevs;
  787         int i, rc, is_newer;
  788         char *upbuf;
  789         const struct uberblock *up;
  790 
  791         /*
  792          * Load the vdev label and figure out which
  793          * uberblock is most current.
  794          */
  795         memset(&vtmp, 0, sizeof(vtmp));
  796         vtmp.v_phys_read = read;
  797         vtmp.v_read_priv = read_priv;
  798         off = offsetof(vdev_label_t, vl_vdev_phys);
  799         BP_ZERO(&bp);
  800         BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
  801         BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
  802         BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  803         BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  804         DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
  805         ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  806         if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
  807                 return (EIO);
  808 
  809         if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
  810                 return (EIO);
  811         }
  812 
  813         nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
  814 
  815         if (nvlist_find(nvlist,
  816                         ZPOOL_CONFIG_VERSION,
  817                         DATA_TYPE_UINT64, 0, &val)) {
  818                 return (EIO);
  819         }
  820 
  821         if (val > SPA_VERSION) {
  822                 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
  823                     (unsigned) val, (unsigned) SPA_VERSION);
  824                 return (EIO);
  825         }
  826 
  827         if (nvlist_find(nvlist,
  828                         ZPOOL_CONFIG_POOL_STATE,
  829                         DATA_TYPE_UINT64, 0, &val)) {
  830                 return (EIO);
  831         }
  832 
  833         if (val == POOL_STATE_DESTROYED) {
  834                 /* We don't boot from destroyed pools. */
  835                 return (EIO);
  836         }
  837 
  838         if (nvlist_find(nvlist,
  839                         ZPOOL_CONFIG_POOL_TXG,
  840                         DATA_TYPE_UINT64, 0, &pool_txg)
  841             || nvlist_find(nvlist,
  842                            ZPOOL_CONFIG_POOL_GUID,
  843                            DATA_TYPE_UINT64, 0, &pool_guid)
  844             || nvlist_find(nvlist,
  845                            ZPOOL_CONFIG_POOL_NAME,
  846                            DATA_TYPE_STRING, 0, &pool_name)) {
  847                 /*
  848                  * Cache and spare devices end up here - just ignore
  849                  * them.
  850                  */
  851                 /*printf("ZFS: can't find pool details\n");*/
  852                 return (EIO);
  853         }
  854 
  855         is_log = 0;
  856         (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
  857             &is_log);
  858         if (is_log)
  859                 return (EIO);
  860 
  861         /*
  862          * Create the pool if this is the first time we've seen it.
  863          */
  864         spa = spa_find_by_guid(pool_guid);
  865         if (!spa) {
  866                 spa = spa_create(pool_guid);
  867                 spa->spa_name = strdup(pool_name);
  868         }
  869         if (pool_txg > spa->spa_txg) {
  870                 spa->spa_txg = pool_txg;
  871                 is_newer = 1;
  872         } else
  873                 is_newer = 0;
  874 
  875         /*
  876          * Get the vdev tree and create our in-core copy of it.
  877          * If we already have a vdev with this guid, this must
  878          * be some kind of alias (overlapping slices, dangerously dedicated
  879          * disks etc).
  880          */
  881         if (nvlist_find(nvlist,
  882                         ZPOOL_CONFIG_GUID,
  883                         DATA_TYPE_UINT64, 0, &guid)) {
  884                 return (EIO);
  885         }
  886         vdev = vdev_find(guid);
  887         if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
  888                 return (EIO);
  889 
  890         if (nvlist_find(nvlist,
  891                         ZPOOL_CONFIG_VDEV_TREE,
  892                         DATA_TYPE_NVLIST, 0, &vdevs)) {
  893                 return (EIO);
  894         }
  895 
  896         rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
  897         if (rc)
  898                 return (rc);
  899 
  900         /*
  901          * Add the toplevel vdev to the pool if its not already there.
  902          */
  903         STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
  904                 if (top_vdev == pool_vdev)
  905                         break;
  906         if (!pool_vdev && top_vdev)
  907                 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
  908 
  909         /*
  910          * We should already have created an incomplete vdev for this
  911          * vdev. Find it and initialise it with our read proc.
  912          */
  913         vdev = vdev_find(guid);
  914         if (vdev) {
  915                 vdev->v_phys_read = read;
  916                 vdev->v_read_priv = read_priv;
  917                 vdev->v_state = VDEV_STATE_HEALTHY;
  918         } else {
  919                 printf("ZFS: inconsistent nvlist contents\n");
  920                 return (EIO);
  921         }
  922 
  923         /*
  924          * Re-evaluate top-level vdev state.
  925          */
  926         vdev_set_state(top_vdev);
  927 
  928         /*
  929          * Ok, we are happy with the pool so far. Lets find
  930          * the best uberblock and then we can actually access
  931          * the contents of the pool.
  932          */
  933         upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
  934         up = (const struct uberblock *)upbuf;
  935         for (i = 0;
  936              i < VDEV_UBERBLOCK_COUNT(vdev);
  937              i++) {
  938                 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
  939                 BP_ZERO(&bp);
  940                 DVA_SET_OFFSET(&bp.blk_dva[0], off);
  941                 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
  942                 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
  943                 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
  944                 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
  945                 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
  946 
  947                 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
  948                         continue;
  949 
  950                 if (up->ub_magic != UBERBLOCK_MAGIC)
  951                         continue;
  952                 if (up->ub_txg < spa->spa_txg)
  953                         continue;
  954                 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
  955                         spa->spa_uberblock = *up;
  956                 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
  957                         if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
  958                                 spa->spa_uberblock = *up;
  959                 }
  960         }
  961         zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
  962 
  963         if (spap)
  964                 *spap = spa;
  965         return (0);
  966 }
  967 
  968 static int
  969 ilog2(int n)
  970 {
  971         int v;
  972 
  973         for (v = 0; v < 32; v++)
  974                 if (n == (1 << v))
  975                         return v;
  976         return -1;
  977 }
  978 
  979 static int
  980 zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
  981 {
  982         blkptr_t gbh_bp;
  983         zio_gbh_phys_t zio_gb;
  984         char *pbuf;
  985         int i;
  986 
  987         /* Artificial BP for gang block header. */
  988         gbh_bp = *bp;
  989         BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
  990         BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
  991         BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
  992         BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
  993         for (i = 0; i < SPA_DVAS_PER_BP; i++)
  994                 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
  995 
  996         /* Read gang header block using the artificial BP. */
  997         if (zio_read(spa, &gbh_bp, &zio_gb))
  998                 return (EIO);
  999 
 1000         pbuf = buf;
 1001         for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
 1002                 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
 1003 
 1004                 if (BP_IS_HOLE(gbp))
 1005                         continue;
 1006                 if (zio_read(spa, gbp, pbuf))
 1007                         return (EIO);
 1008                 pbuf += BP_GET_PSIZE(gbp);
 1009         }
 1010 
 1011         if (zio_checksum_verify(bp, buf))
 1012                 return (EIO);
 1013         return (0);
 1014 }
 1015 
 1016 static int
 1017 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
 1018 {
 1019         int cpfunc = BP_GET_COMPRESS(bp);
 1020         uint64_t align, size;
 1021         void *pbuf;
 1022         int i, error;
 1023 
 1024         error = EIO;
 1025 
 1026         for (i = 0; i < SPA_DVAS_PER_BP; i++) {
 1027                 const dva_t *dva = &bp->blk_dva[i];
 1028                 vdev_t *vdev;
 1029                 int vdevid;
 1030                 off_t offset;
 1031 
 1032                 if (!dva->dva_word[0] && !dva->dva_word[1])
 1033                         continue;
 1034 
 1035                 vdevid = DVA_GET_VDEV(dva);
 1036                 offset = DVA_GET_OFFSET(dva);
 1037                 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
 1038                         if (vdev->v_id == vdevid)
 1039                                 break;
 1040                 }
 1041                 if (!vdev || !vdev->v_read)
 1042                         continue;
 1043 
 1044                 size = BP_GET_PSIZE(bp);
 1045                 if (vdev->v_read == vdev_raidz_read) {
 1046                         align = 1ULL << vdev->v_top->v_ashift;
 1047                         if (P2PHASE(size, align) != 0)
 1048                                 size = P2ROUNDUP(size, align);
 1049                 }
 1050                 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
 1051                         pbuf = zfs_alloc(size);
 1052                 else
 1053                         pbuf = buf;
 1054 
 1055                 if (DVA_GET_GANG(dva))
 1056                         error = zio_read_gang(spa, bp, pbuf);
 1057                 else
 1058                         error = vdev->v_read(vdev, bp, pbuf, offset, size);
 1059                 if (error == 0) {
 1060                         if (cpfunc != ZIO_COMPRESS_OFF)
 1061                                 error = zio_decompress_data(cpfunc, pbuf,
 1062                                     BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
 1063                         else if (size != BP_GET_PSIZE(bp))
 1064                                 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
 1065                 }
 1066                 if (buf != pbuf)
 1067                         zfs_free(pbuf, size);
 1068                 if (error == 0)
 1069                         break;
 1070         }
 1071         if (error != 0)
 1072                 printf("ZFS: i/o error - all block copies unavailable\n");
 1073         return (error);
 1074 }
 1075 
 1076 static int
 1077 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
 1078 {
 1079         int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
 1080         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1081         int nlevels = dnode->dn_nlevels;
 1082         int i, rc;
 1083 
 1084         /*
 1085          * Note: bsize may not be a power of two here so we need to do an
 1086          * actual divide rather than a bitshift.
 1087          */
 1088         while (buflen > 0) {
 1089                 uint64_t bn = offset / bsize;
 1090                 int boff = offset % bsize;
 1091                 int ibn;
 1092                 const blkptr_t *indbp;
 1093                 blkptr_t bp;
 1094 
 1095                 if (bn > dnode->dn_maxblkid)
 1096                         return (EIO);
 1097 
 1098                 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
 1099                         goto cached;
 1100 
 1101                 indbp = dnode->dn_blkptr;
 1102                 for (i = 0; i < nlevels; i++) {
 1103                         /*
 1104                          * Copy the bp from the indirect array so that
 1105                          * we can re-use the scratch buffer for multi-level
 1106                          * objects.
 1107                          */
 1108                         ibn = bn >> ((nlevels - i - 1) * ibshift);
 1109                         ibn &= ((1 << ibshift) - 1);
 1110                         bp = indbp[ibn];
 1111                         rc = zio_read(spa, &bp, dnode_cache_buf);
 1112                         if (rc)
 1113                                 return (rc);
 1114                         indbp = (const blkptr_t *) dnode_cache_buf;
 1115                 }
 1116                 dnode_cache_obj = dnode;
 1117                 dnode_cache_bn = bn;
 1118         cached:
 1119 
 1120                 /*
 1121                  * The buffer contains our data block. Copy what we
 1122                  * need from it and loop.
 1123                  */ 
 1124                 i = bsize - boff;
 1125                 if (i > buflen) i = buflen;
 1126                 memcpy(buf, &dnode_cache_buf[boff], i);
 1127                 buf = ((char*) buf) + i;
 1128                 offset += i;
 1129                 buflen -= i;
 1130         }
 1131 
 1132         return (0);
 1133 }
 1134 
 1135 /*
 1136  * Lookup a value in a microzap directory. Assumes that the zap
 1137  * scratch buffer contains the directory contents.
 1138  */
 1139 static int
 1140 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1141 {
 1142         const mzap_phys_t *mz;
 1143         const mzap_ent_phys_t *mze;
 1144         size_t size;
 1145         int chunks, i;
 1146 
 1147         /*
 1148          * Microzap objects use exactly one block. Read the whole
 1149          * thing.
 1150          */
 1151         size = dnode->dn_datablkszsec * 512;
 1152 
 1153         mz = (const mzap_phys_t *) zap_scratch;
 1154         chunks = size / MZAP_ENT_LEN - 1;
 1155 
 1156         for (i = 0; i < chunks; i++) {
 1157                 mze = &mz->mz_chunk[i];
 1158                 if (!strcmp(mze->mze_name, name)) {
 1159                         *value = mze->mze_value;
 1160                         return (0);
 1161                 }
 1162         }
 1163 
 1164         return (ENOENT);
 1165 }
 1166 
 1167 /*
 1168  * Compare a name with a zap leaf entry. Return non-zero if the name
 1169  * matches.
 1170  */
 1171 static int
 1172 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
 1173 {
 1174         size_t namelen;
 1175         const zap_leaf_chunk_t *nc;
 1176         const char *p;
 1177 
 1178         namelen = zc->l_entry.le_name_length;
 1179                         
 1180         nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
 1181         p = name;
 1182         while (namelen > 0) {
 1183                 size_t len;
 1184                 len = namelen;
 1185                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1186                         len = ZAP_LEAF_ARRAY_BYTES;
 1187                 if (memcmp(p, nc->l_array.la_array, len))
 1188                         return (0);
 1189                 p += len;
 1190                 namelen -= len;
 1191                 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
 1192         }
 1193 
 1194         return 1;
 1195 }
 1196 
 1197 /*
 1198  * Extract a uint64_t value from a zap leaf entry.
 1199  */
 1200 static uint64_t
 1201 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
 1202 {
 1203         const zap_leaf_chunk_t *vc;
 1204         int i;
 1205         uint64_t value;
 1206         const uint8_t *p;
 1207 
 1208         vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
 1209         for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
 1210                 value = (value << 8) | p[i];
 1211         }
 1212 
 1213         return value;
 1214 }
 1215 
 1216 /*
 1217  * Lookup a value in a fatzap directory. Assumes that the zap scratch
 1218  * buffer contains the directory header.
 1219  */
 1220 static int
 1221 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1222 {
 1223         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1224         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1225         fat_zap_t z;
 1226         uint64_t *ptrtbl;
 1227         uint64_t hash;
 1228         int rc;
 1229 
 1230         if (zh.zap_magic != ZAP_MAGIC)
 1231                 return (EIO);
 1232 
 1233         z.zap_block_shift = ilog2(bsize);
 1234         z.zap_phys = (zap_phys_t *) zap_scratch;
 1235 
 1236         /*
 1237          * Figure out where the pointer table is and read it in if necessary.
 1238          */
 1239         if (zh.zap_ptrtbl.zt_blk) {
 1240                 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
 1241                                zap_scratch, bsize);
 1242                 if (rc)
 1243                         return (rc);
 1244                 ptrtbl = (uint64_t *) zap_scratch;
 1245         } else {
 1246                 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
 1247         }
 1248 
 1249         hash = zap_hash(zh.zap_salt, name);
 1250 
 1251         zap_leaf_t zl;
 1252         zl.l_bs = z.zap_block_shift;
 1253 
 1254         off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
 1255         zap_leaf_chunk_t *zc;
 1256 
 1257         rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
 1258         if (rc)
 1259                 return (rc);
 1260 
 1261         zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1262 
 1263         /*
 1264          * Make sure this chunk matches our hash.
 1265          */
 1266         if (zl.l_phys->l_hdr.lh_prefix_len > 0
 1267             && zl.l_phys->l_hdr.lh_prefix
 1268             != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
 1269                 return (ENOENT);
 1270 
 1271         /*
 1272          * Hash within the chunk to find our entry.
 1273          */
 1274         int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
 1275         int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
 1276         h = zl.l_phys->l_hash[h];
 1277         if (h == 0xffff)
 1278                 return (ENOENT);
 1279         zc = &ZAP_LEAF_CHUNK(&zl, h);
 1280         while (zc->l_entry.le_hash != hash) {
 1281                 if (zc->l_entry.le_next == 0xffff) {
 1282                         zc = 0;
 1283                         break;
 1284                 }
 1285                 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
 1286         }
 1287         if (fzap_name_equal(&zl, zc, name)) {
 1288                 *value = fzap_leaf_value(&zl, zc);
 1289                 return (0);
 1290         }
 1291 
 1292         return (ENOENT);
 1293 }
 1294 
 1295 /*
 1296  * Lookup a name in a zap object and return its value as a uint64_t.
 1297  */
 1298 static int
 1299 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
 1300 {
 1301         int rc;
 1302         uint64_t zap_type;
 1303         size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1304 
 1305         rc = dnode_read(spa, dnode, 0, zap_scratch, size);
 1306         if (rc)
 1307                 return (rc);
 1308 
 1309         zap_type = *(uint64_t *) zap_scratch;
 1310         if (zap_type == ZBT_MICRO)
 1311                 return mzap_lookup(spa, dnode, name, value);
 1312         else if (zap_type == ZBT_HEADER)
 1313                 return fzap_lookup(spa, dnode, name, value);
 1314         printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
 1315         return (EIO);
 1316 }
 1317 
 1318 #ifdef BOOT2
 1319 
 1320 /*
 1321  * List a microzap directory. Assumes that the zap scratch buffer contains
 1322  * the directory contents.
 1323  */
 1324 static int
 1325 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1326 {
 1327         const mzap_phys_t *mz;
 1328         const mzap_ent_phys_t *mze;
 1329         size_t size;
 1330         int chunks, i;
 1331 
 1332         /*
 1333          * Microzap objects use exactly one block. Read the whole
 1334          * thing.
 1335          */
 1336         size = dnode->dn_datablkszsec * 512;
 1337         mz = (const mzap_phys_t *) zap_scratch;
 1338         chunks = size / MZAP_ENT_LEN - 1;
 1339 
 1340         for (i = 0; i < chunks; i++) {
 1341                 mze = &mz->mz_chunk[i];
 1342                 if (mze->mze_name[0])
 1343                         //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
 1344                         printf("%s\n", mze->mze_name);
 1345         }
 1346 
 1347         return (0);
 1348 }
 1349 
 1350 /*
 1351  * List a fatzap directory. Assumes that the zap scratch buffer contains
 1352  * the directory header.
 1353  */
 1354 static int
 1355 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
 1356 {
 1357         int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 1358         zap_phys_t zh = *(zap_phys_t *) zap_scratch;
 1359         fat_zap_t z;
 1360         int i, j;
 1361 
 1362         if (zh.zap_magic != ZAP_MAGIC)
 1363                 return (EIO);
 1364 
 1365         z.zap_block_shift = ilog2(bsize);
 1366         z.zap_phys = (zap_phys_t *) zap_scratch;
 1367 
 1368         /*
 1369          * This assumes that the leaf blocks start at block 1. The
 1370          * documentation isn't exactly clear on this.
 1371          */
 1372         zap_leaf_t zl;
 1373         zl.l_bs = z.zap_block_shift;
 1374         for (i = 0; i < zh.zap_num_leafs; i++) {
 1375                 off_t off = (i + 1) << zl.l_bs;
 1376                 char name[256], *p;
 1377                 uint64_t value;
 1378 
 1379                 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
 1380                         return (EIO);
 1381 
 1382                 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
 1383 
 1384                 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
 1385                         zap_leaf_chunk_t *zc, *nc;
 1386                         int namelen;
 1387 
 1388                         zc = &ZAP_LEAF_CHUNK(&zl, j);
 1389                         if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
 1390                                 continue;
 1391                         namelen = zc->l_entry.le_name_length;
 1392                         if (namelen > sizeof(name))
 1393                                 namelen = sizeof(name);
 1394                         
 1395                         /*
 1396                          * Paste the name back together.
 1397                          */
 1398                         nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
 1399                         p = name;
 1400                         while (namelen > 0) {
 1401                                 int len;
 1402                                 len = namelen;
 1403                                 if (len > ZAP_LEAF_ARRAY_BYTES)
 1404                                         len = ZAP_LEAF_ARRAY_BYTES;
 1405                                 memcpy(p, nc->l_array.la_array, len);
 1406                                 p += len;
 1407                                 namelen -= len;
 1408                                 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
 1409                         }
 1410 
 1411                         /*
 1412                          * Assume the first eight bytes of the value are
 1413                          * a uint64_t.
 1414                          */
 1415                         value = fzap_leaf_value(&zl, zc);
 1416 
 1417                         printf("%s 0x%llx\n", name, value);
 1418                 }
 1419         }
 1420 
 1421         return (0);
 1422 }
 1423 
 1424 /*
 1425  * List a zap directory.
 1426  */
 1427 static int
 1428 zap_list(spa_t *spa, const dnode_phys_t *dnode)
 1429 {
 1430         uint64_t zap_type;
 1431         size_t size = dnode->dn_datablkszsec * 512;
 1432 
 1433         if (dnode_read(spa, dnode, 0, zap_scratch, size))
 1434                 return (EIO);
 1435 
 1436         zap_type = *(uint64_t *) zap_scratch;
 1437         if (zap_type == ZBT_MICRO)
 1438                 return mzap_list(spa, dnode);
 1439         else
 1440                 return fzap_list(spa, dnode);
 1441 }
 1442 
 1443 #endif
 1444 
 1445 static int
 1446 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
 1447 {
 1448         off_t offset;
 1449 
 1450         offset = objnum * sizeof(dnode_phys_t);
 1451         return dnode_read(spa, &os->os_meta_dnode, offset,
 1452                 dnode, sizeof(dnode_phys_t));
 1453 }
 1454 
 1455 /*
 1456  * Find the object set given the object number of its dataset object
 1457  * and return its details in *objset
 1458  */
 1459 static int
 1460 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
 1461 {
 1462         dnode_phys_t dataset;
 1463         dsl_dataset_phys_t *ds;
 1464 
 1465         if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
 1466                 printf("ZFS: can't find dataset %llu\n", objnum);
 1467                 return (EIO);
 1468         }
 1469 
 1470         ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
 1471         if (zio_read(spa, &ds->ds_bp, objset)) {
 1472                 printf("ZFS: can't read object set for dataset %llu\n", objnum);
 1473                 return (EIO);
 1474         }
 1475 
 1476         return (0);
 1477 }
 1478 
 1479 /*
 1480  * Find the object set pointed to by the BOOTFS property or the root
 1481  * dataset if there is none and return its details in *objset
 1482  */
 1483 static int
 1484 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
 1485 {
 1486         dnode_phys_t dir, propdir;
 1487         uint64_t props, bootfs, root;
 1488 
 1489         /*
 1490          * Start with the MOS directory object.
 1491          */
 1492         if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
 1493                 printf("ZFS: can't read MOS object directory\n");
 1494                 return (EIO);
 1495         }
 1496 
 1497         /*
 1498          * Lookup the pool_props and see if we can find a bootfs.
 1499          */
 1500         if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
 1501              && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
 1502              && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
 1503              && bootfs != 0)
 1504                 return zfs_mount_dataset(spa, bootfs, objset);
 1505 
 1506         /*
 1507          * Lookup the root dataset directory
 1508          */
 1509         if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
 1510             || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
 1511                 printf("ZFS: can't find root dsl_dir\n");
 1512                 return (EIO);
 1513         }
 1514 
 1515         /*
 1516          * Use the information from the dataset directory's bonus buffer
 1517          * to find the dataset object and from that the object set itself.
 1518          */
 1519         dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
 1520         return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
 1521 }
 1522 
 1523 static int
 1524 zfs_mount_pool(spa_t *spa)
 1525 {
 1526 
 1527         /*
 1528          * Find the MOS and work our way in from there.
 1529          */
 1530         if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
 1531                 printf("ZFS: can't read MOS\n");
 1532                 return (EIO);
 1533         }
 1534 
 1535         /*
 1536          * Find the root object set
 1537          */
 1538         if (zfs_mount_root(spa, &spa->spa_root_objset)) {
 1539                 printf("Can't find root filesystem - giving up\n");
 1540                 return (EIO);
 1541         }
 1542 
 1543         return (0);
 1544 }
 1545 
 1546 static int
 1547 zfs_dnode_stat(spa_t *spa, dnode_phys_t *dn, struct stat *sb)
 1548 {
 1549 
 1550         if (dn->dn_bonustype != DMU_OT_SA) {
 1551                 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
 1552 
 1553                 sb->st_mode = zp->zp_mode;
 1554                 sb->st_uid = zp->zp_uid;
 1555                 sb->st_gid = zp->zp_gid;
 1556                 sb->st_size = zp->zp_size;
 1557         } else {
 1558                 sa_hdr_phys_t *sahdrp;
 1559                 int hdrsize;
 1560                 size_t size = 0;
 1561                 void *buf = NULL;
 1562 
 1563                 if (dn->dn_bonuslen != 0)
 1564                         sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
 1565                 else {
 1566                         if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
 1567                                 blkptr_t *bp = &dn->dn_spill;
 1568                                 int error;
 1569 
 1570                                 size = BP_GET_LSIZE(bp);
 1571                                 buf = zfs_alloc(size);
 1572                                 error = zio_read(spa, bp, buf);
 1573                                 if (error != 0) {
 1574                                         zfs_free(buf, size);
 1575                                         return (error);
 1576                                 }
 1577                                 sahdrp = buf;
 1578                         } else {
 1579                                 return (EIO);
 1580                         }
 1581                 }
 1582                 hdrsize = SA_HDR_SIZE(sahdrp);
 1583                 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
 1584                     SA_MODE_OFFSET);
 1585                 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
 1586                     SA_UID_OFFSET);
 1587                 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
 1588                     SA_GID_OFFSET);
 1589                 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
 1590                     SA_SIZE_OFFSET);
 1591                 if (buf != NULL)
 1592                         zfs_free(buf, size);
 1593         }
 1594 
 1595         return (0);
 1596 }
 1597 
 1598 /*
 1599  * Lookup a file and return its dnode.
 1600  */
 1601 static int
 1602 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
 1603 {
 1604         int rc;
 1605         uint64_t objnum, rootnum, parentnum;
 1606         dnode_phys_t dn;
 1607         const char *p, *q;
 1608         char element[256];
 1609         char path[1024];
 1610         int symlinks_followed = 0;
 1611         struct stat sb;
 1612 
 1613         if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
 1614                 printf("ZFS: unexpected object set type %llu\n",
 1615                        spa->spa_root_objset.os_type);
 1616                 return (EIO);
 1617         }
 1618 
 1619         /*
 1620          * Get the root directory dnode.
 1621          */
 1622         rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
 1623         if (rc)
 1624                 return (rc);
 1625 
 1626         rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
 1627         if (rc)
 1628                 return (rc);
 1629 
 1630         rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
 1631         if (rc)
 1632                 return (rc);
 1633 
 1634         objnum = rootnum;
 1635         p = upath;
 1636         while (p && *p) {
 1637                 while (*p == '/')
 1638                         p++;
 1639                 if (!*p)
 1640                         break;
 1641                 q = strchr(p, '/');
 1642                 if (q) {
 1643                         memcpy(element, p, q - p);
 1644                         element[q - p] = 0;
 1645                         p = q;
 1646                 } else {
 1647                         strcpy(element, p);
 1648                         p = 0;
 1649                 }
 1650 
 1651                 rc = zfs_dnode_stat(spa, &dn, &sb);
 1652                 if (rc)
 1653                         return (rc);
 1654                 if (!S_ISDIR(sb.st_mode))
 1655                         return (ENOTDIR);
 1656 
 1657                 parentnum = objnum;
 1658                 rc = zap_lookup(spa, &dn, element, &objnum);
 1659                 if (rc)
 1660                         return (rc);
 1661                 objnum = ZFS_DIRENT_OBJ(objnum);
 1662 
 1663                 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1664                 if (rc)
 1665                         return (rc);
 1666 
 1667                 /*
 1668                  * Check for symlink.
 1669                  */
 1670                 rc = zfs_dnode_stat(spa, &dn, &sb);
 1671                 if (rc)
 1672                         return (rc);
 1673                 if (S_ISLNK(sb.st_mode)) {
 1674                         if (symlinks_followed > 10)
 1675                                 return (EMLINK);
 1676                         symlinks_followed++;
 1677 
 1678                         /*
 1679                          * Read the link value and copy the tail of our
 1680                          * current path onto the end.
 1681                          */
 1682                         if (p)
 1683                                 strcpy(&path[sb.st_size], p);
 1684                         else
 1685                                 path[sb.st_size] = 0;
 1686                         if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
 1687                                 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
 1688                                         sb.st_size);
 1689                         } else {
 1690                                 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
 1691                                 if (rc)
 1692                                         return (rc);
 1693                         }
 1694 
 1695                         /*
 1696                          * Restart with the new path, starting either at
 1697                          * the root or at the parent depending whether or
 1698                          * not the link is relative.
 1699                          */
 1700                         p = path;
 1701                         if (*p == '/')
 1702                                 objnum = rootnum;
 1703                         else
 1704                                 objnum = parentnum;
 1705                         objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
 1706                 }
 1707         }
 1708 
 1709         *dnode = dn;
 1710         return (0);
 1711 }

Cache object: 18f77a9046c83b24fd5ca2e2c936091f


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.