FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c
1 /*-
2 * Copyright (c) 2007 Doug Rabson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29
30 /*
31 * Stand-alone ZFS file reader.
32 */
33
34 #include <sys/stat.h>
35 #include <sys/stdint.h>
36
37 #include "zfsimpl.h"
38 #include "zfssubr.c"
39
40
41 struct zfsmount {
42 const spa_t *spa;
43 objset_phys_t objset;
44 uint64_t rootobj;
45 };
46
47 /*
48 * List of all vdevs, chained through v_alllink.
49 */
50 static vdev_list_t zfs_vdevs;
51
52 /*
53 * List of ZFS features supported for read
54 */
55 static const char *features_for_read[] = {
56 "org.illumos:lz4_compress",
57 "com.delphix:hole_birth",
58 "com.delphix:extensible_dataset",
59 NULL
60 };
61
62 /*
63 * List of all pools, chained through spa_link.
64 */
65 static spa_list_t zfs_pools;
66
67 static uint64_t zfs_crc64_table[256];
68 static const dnode_phys_t *dnode_cache_obj = 0;
69 static uint64_t dnode_cache_bn;
70 static char *dnode_cache_buf;
71 static char *zap_scratch;
72 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
73
74 #define TEMP_SIZE (1024 * 1024)
75
76 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
77 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
78 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
79
80 static void
81 zfs_init(void)
82 {
83 STAILQ_INIT(&zfs_vdevs);
84 STAILQ_INIT(&zfs_pools);
85
86 zfs_temp_buf = malloc(TEMP_SIZE);
87 zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
88 zfs_temp_ptr = zfs_temp_buf;
89 dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
90 zap_scratch = malloc(SPA_MAXBLOCKSIZE);
91
92 zfs_init_crc();
93 }
94
95 static void *
96 zfs_alloc(size_t size)
97 {
98 char *ptr;
99
100 if (zfs_temp_ptr + size > zfs_temp_end) {
101 printf("ZFS: out of temporary buffer space\n");
102 for (;;) ;
103 }
104 ptr = zfs_temp_ptr;
105 zfs_temp_ptr += size;
106
107 return (ptr);
108 }
109
110 static void
111 zfs_free(void *ptr, size_t size)
112 {
113
114 zfs_temp_ptr -= size;
115 if (zfs_temp_ptr != ptr) {
116 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
117 for (;;) ;
118 }
119 }
120
/*
 * Decode one big-endian 32-bit integer from the XDR stream at *xdr,
 * store it in *ip and advance *xdr by four bytes.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value as unsigned: shifting a promoted (signed) int
	 * left by 24 overflows when the top byte is 0x80 or larger.
	 */
	v = ((uint32_t)p[0] << 24)
	    | ((uint32_t)p[1] << 16)
	    | ((uint32_t)p[2] << 8)
	    | ((uint32_t)p[3] << 0);
	*ip = (int)v;
	*xdr = p + 4;
	return (0);
}
131
/*
 * Decode one big-endian unsigned 32-bit integer from the XDR stream at
 * *xdr, store it in *ip and advance *xdr by four bytes.  Always returns 0.
 */
static int
xdr_u_int(const unsigned char **xdr, u_int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value as unsigned: shifting a promoted (signed) int
	 * left by 24 overflows when the top byte is 0x80 or larger.
	 */
	v = ((uint32_t)p[0] << 24)
	    | ((uint32_t)p[1] << 16)
	    | ((uint32_t)p[2] << 8)
	    | ((uint32_t)p[3] << 0);
	*ip = v;
	*xdr = p + 4;
	return (0);
}
142
143 static int
144 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
145 {
146 u_int hi, lo;
147
148 xdr_u_int(xdr, &hi);
149 xdr_u_int(xdr, &lo);
150 *lp = (((uint64_t) hi) << 32) | lo;
151 return (0);
152 }
153
154 static int
155 nvlist_find(const unsigned char *nvlist, const char *name, int type,
156 int* elementsp, void *valuep)
157 {
158 const unsigned char *p, *pair;
159 int junk;
160 int encoded_size, decoded_size;
161
162 p = nvlist;
163 xdr_int(&p, &junk);
164 xdr_int(&p, &junk);
165
166 pair = p;
167 xdr_int(&p, &encoded_size);
168 xdr_int(&p, &decoded_size);
169 while (encoded_size && decoded_size) {
170 int namelen, pairtype, elements;
171 const char *pairname;
172
173 xdr_int(&p, &namelen);
174 pairname = (const char*) p;
175 p += roundup(namelen, 4);
176 xdr_int(&p, &pairtype);
177
178 if (!memcmp(name, pairname, namelen) && type == pairtype) {
179 xdr_int(&p, &elements);
180 if (elementsp)
181 *elementsp = elements;
182 if (type == DATA_TYPE_UINT64) {
183 xdr_uint64_t(&p, (uint64_t *) valuep);
184 return (0);
185 } else if (type == DATA_TYPE_STRING) {
186 int len;
187 xdr_int(&p, &len);
188 (*(const char**) valuep) = (const char*) p;
189 return (0);
190 } else if (type == DATA_TYPE_NVLIST
191 || type == DATA_TYPE_NVLIST_ARRAY) {
192 (*(const unsigned char**) valuep) =
193 (const unsigned char*) p;
194 return (0);
195 } else {
196 return (EIO);
197 }
198 } else {
199 /*
200 * Not the pair we are looking for, skip to the next one.
201 */
202 p = pair + encoded_size;
203 }
204
205 pair = p;
206 xdr_int(&p, &encoded_size);
207 xdr_int(&p, &decoded_size);
208 }
209
210 return (EIO);
211 }
212
213 static int
214 nvlist_check_features_for_read(const unsigned char *nvlist)
215 {
216 const unsigned char *p, *pair;
217 int junk;
218 int encoded_size, decoded_size;
219 int rc;
220
221 rc = 0;
222
223 p = nvlist;
224 xdr_int(&p, &junk);
225 xdr_int(&p, &junk);
226
227 pair = p;
228 xdr_int(&p, &encoded_size);
229 xdr_int(&p, &decoded_size);
230 while (encoded_size && decoded_size) {
231 int namelen, pairtype;
232 const char *pairname;
233 int i, found;
234
235 found = 0;
236
237 xdr_int(&p, &namelen);
238 pairname = (const char*) p;
239 p += roundup(namelen, 4);
240 xdr_int(&p, &pairtype);
241
242 for (i = 0; features_for_read[i] != NULL; i++) {
243 if (!memcmp(pairname, features_for_read[i], namelen)) {
244 found = 1;
245 break;
246 }
247 }
248
249 if (!found) {
250 printf("ZFS: unsupported feature: %s\n", pairname);
251 rc = EIO;
252 }
253
254 p = pair + encoded_size;
255
256 pair = p;
257 xdr_int(&p, &encoded_size);
258 xdr_int(&p, &decoded_size);
259 }
260
261 return (rc);
262 }
263
/*
 * Step over one nvlist of an nvlist array: skip the two header words and
 * every pair, and return the address just past the terminating pair
 * header (a pair whose encoded or decoded size is zero).
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int header_word;
	int enc_len, dec_len;

	cursor = nvlist;
	xdr_int(&cursor, &header_word);
	xdr_int(&cursor, &header_word);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		cursor = pair_start + enc_len;
	}

	return (cursor);
}
291
292 #ifdef TEST
293
294 static const unsigned char *
295 nvlist_print(const unsigned char *nvlist, unsigned int indent)
296 {
297 static const char* typenames[] = {
298 "DATA_TYPE_UNKNOWN",
299 "DATA_TYPE_BOOLEAN",
300 "DATA_TYPE_BYTE",
301 "DATA_TYPE_INT16",
302 "DATA_TYPE_UINT16",
303 "DATA_TYPE_INT32",
304 "DATA_TYPE_UINT32",
305 "DATA_TYPE_INT64",
306 "DATA_TYPE_UINT64",
307 "DATA_TYPE_STRING",
308 "DATA_TYPE_BYTE_ARRAY",
309 "DATA_TYPE_INT16_ARRAY",
310 "DATA_TYPE_UINT16_ARRAY",
311 "DATA_TYPE_INT32_ARRAY",
312 "DATA_TYPE_UINT32_ARRAY",
313 "DATA_TYPE_INT64_ARRAY",
314 "DATA_TYPE_UINT64_ARRAY",
315 "DATA_TYPE_STRING_ARRAY",
316 "DATA_TYPE_HRTIME",
317 "DATA_TYPE_NVLIST",
318 "DATA_TYPE_NVLIST_ARRAY",
319 "DATA_TYPE_BOOLEAN_VALUE",
320 "DATA_TYPE_INT8",
321 "DATA_TYPE_UINT8",
322 "DATA_TYPE_BOOLEAN_ARRAY",
323 "DATA_TYPE_INT8_ARRAY",
324 "DATA_TYPE_UINT8_ARRAY"
325 };
326
327 unsigned int i, j;
328 const unsigned char *p, *pair;
329 int junk;
330 int encoded_size, decoded_size;
331
332 p = nvlist;
333 xdr_int(&p, &junk);
334 xdr_int(&p, &junk);
335
336 pair = p;
337 xdr_int(&p, &encoded_size);
338 xdr_int(&p, &decoded_size);
339 while (encoded_size && decoded_size) {
340 int namelen, pairtype, elements;
341 const char *pairname;
342
343 xdr_int(&p, &namelen);
344 pairname = (const char*) p;
345 p += roundup(namelen, 4);
346 xdr_int(&p, &pairtype);
347
348 for (i = 0; i < indent; i++)
349 printf(" ");
350 printf("%s %s", typenames[pairtype], pairname);
351
352 xdr_int(&p, &elements);
353 switch (pairtype) {
354 case DATA_TYPE_UINT64: {
355 uint64_t val;
356 xdr_uint64_t(&p, &val);
357 printf(" = 0x%jx\n", (uintmax_t)val);
358 break;
359 }
360
361 case DATA_TYPE_STRING: {
362 int len;
363 xdr_int(&p, &len);
364 printf(" = \"%s\"\n", p);
365 break;
366 }
367
368 case DATA_TYPE_NVLIST:
369 printf("\n");
370 nvlist_print(p, indent + 1);
371 break;
372
373 case DATA_TYPE_NVLIST_ARRAY:
374 for (j = 0; j < elements; j++) {
375 printf("[%d]\n", j);
376 p = nvlist_print(p, indent + 1);
377 if (j != elements - 1) {
378 for (i = 0; i < indent; i++)
379 printf(" ");
380 printf("%s %s", typenames[pairtype], pairname);
381 }
382 }
383 break;
384
385 default:
386 printf("\n");
387 }
388
389 p = pair + encoded_size;
390
391 pair = p;
392 xdr_int(&p, &encoded_size);
393 xdr_int(&p, &decoded_size);
394 }
395
396 return p;
397 }
398
399 #endif
400
401 static int
402 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
403 off_t offset, size_t size)
404 {
405 size_t psize;
406 int rc;
407
408 if (!vdev->v_phys_read)
409 return (EIO);
410
411 if (bp) {
412 psize = BP_GET_PSIZE(bp);
413 } else {
414 psize = size;
415 }
416
417 /*printf("ZFS: reading %d bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
418 rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
419 if (rc)
420 return (rc);
421 if (bp && zio_checksum_verify(bp, buf))
422 return (EIO);
423
424 return (0);
425 }
426
427 static int
428 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
429 off_t offset, size_t bytes)
430 {
431
432 return (vdev_read_phys(vdev, bp, buf,
433 offset + VDEV_LABEL_START_SIZE, bytes));
434 }
435
436
437 static int
438 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
439 off_t offset, size_t bytes)
440 {
441 vdev_t *kid;
442 int rc;
443
444 rc = EIO;
445 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
446 if (kid->v_state != VDEV_STATE_HEALTHY)
447 continue;
448 rc = kid->v_read(kid, bp, buf, offset, bytes);
449 if (!rc)
450 return (0);
451 }
452
453 return (rc);
454 }
455
456 static int
457 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
458 off_t offset, size_t bytes)
459 {
460 vdev_t *kid;
461
462 /*
463 * Here we should have two kids:
464 * First one which is the one we are replacing and we can trust
465 * only this one to have valid data, but it might not be present.
466 * Second one is that one we are replacing with. It is most likely
467 * healthy, but we can't trust it has needed data, so we won't use it.
468 */
469 kid = STAILQ_FIRST(&vdev->v_children);
470 if (kid == NULL)
471 return (EIO);
472 if (kid->v_state != VDEV_STATE_HEALTHY)
473 return (EIO);
474 return (kid->v_read(kid, bp, buf, offset, bytes));
475 }
476
477 static vdev_t *
478 vdev_find(uint64_t guid)
479 {
480 vdev_t *vdev;
481
482 STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
483 if (vdev->v_guid == guid)
484 return (vdev);
485
486 return (0);
487 }
488
489 static vdev_t *
490 vdev_create(uint64_t guid, vdev_read_t *read)
491 {
492 vdev_t *vdev;
493
494 vdev = malloc(sizeof(vdev_t));
495 memset(vdev, 0, sizeof(vdev_t));
496 STAILQ_INIT(&vdev->v_children);
497 vdev->v_guid = guid;
498 vdev->v_state = VDEV_STATE_OFFLINE;
499 vdev->v_read = read;
500 vdev->v_phys_read = 0;
501 vdev->v_read_priv = 0;
502 STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
503
504 return (vdev);
505 }
506
507 static int
508 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
509 vdev_t **vdevp, int is_newer)
510 {
511 int rc;
512 uint64_t guid, id, ashift, nparity;
513 const char *type;
514 const char *path;
515 vdev_t *vdev, *kid;
516 const unsigned char *kids;
517 int nkids, i, is_new;
518 uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
519
520 if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
521 DATA_TYPE_UINT64, 0, &guid)
522 || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
523 DATA_TYPE_UINT64, 0, &id)
524 || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
525 DATA_TYPE_STRING, 0, &type)) {
526 printf("ZFS: can't find vdev details\n");
527 return (ENOENT);
528 }
529
530 if (strcmp(type, VDEV_TYPE_MIRROR)
531 && strcmp(type, VDEV_TYPE_DISK)
532 #ifdef ZFS_TEST
533 && strcmp(type, VDEV_TYPE_FILE)
534 #endif
535 && strcmp(type, VDEV_TYPE_RAIDZ)
536 && strcmp(type, VDEV_TYPE_REPLACING)) {
537 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
538 return (EIO);
539 }
540
541 is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
542
543 nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
544 &is_offline);
545 nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
546 &is_removed);
547 nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
548 &is_faulted);
549 nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
550 &is_degraded);
551 nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
552 &isnt_present);
553
554 vdev = vdev_find(guid);
555 if (!vdev) {
556 is_new = 1;
557
558 if (!strcmp(type, VDEV_TYPE_MIRROR))
559 vdev = vdev_create(guid, vdev_mirror_read);
560 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
561 vdev = vdev_create(guid, vdev_raidz_read);
562 else if (!strcmp(type, VDEV_TYPE_REPLACING))
563 vdev = vdev_create(guid, vdev_replacing_read);
564 else
565 vdev = vdev_create(guid, vdev_disk_read);
566
567 vdev->v_id = id;
568 vdev->v_top = pvdev != NULL ? pvdev : vdev;
569 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
570 DATA_TYPE_UINT64, 0, &ashift) == 0)
571 vdev->v_ashift = ashift;
572 else
573 vdev->v_ashift = 0;
574 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
575 DATA_TYPE_UINT64, 0, &nparity) == 0)
576 vdev->v_nparity = nparity;
577 else
578 vdev->v_nparity = 0;
579 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
580 DATA_TYPE_STRING, 0, &path) == 0) {
581 if (strncmp(path, "/dev/", 5) == 0)
582 path += 5;
583 vdev->v_name = strdup(path);
584 } else {
585 if (!strcmp(type, "raidz")) {
586 if (vdev->v_nparity == 1)
587 vdev->v_name = "raidz1";
588 else if (vdev->v_nparity == 2)
589 vdev->v_name = "raidz2";
590 else if (vdev->v_nparity == 3)
591 vdev->v_name = "raidz3";
592 else {
593 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
594 return (EIO);
595 }
596 } else {
597 vdev->v_name = strdup(type);
598 }
599 }
600 } else {
601 is_new = 0;
602 }
603
604 if (is_new || is_newer) {
605 /*
606 * This is either new vdev or we've already seen this vdev,
607 * but from an older vdev label, so let's refresh its state
608 * from the newer label.
609 */
610 if (is_offline)
611 vdev->v_state = VDEV_STATE_OFFLINE;
612 else if (is_removed)
613 vdev->v_state = VDEV_STATE_REMOVED;
614 else if (is_faulted)
615 vdev->v_state = VDEV_STATE_FAULTED;
616 else if (is_degraded)
617 vdev->v_state = VDEV_STATE_DEGRADED;
618 else if (isnt_present)
619 vdev->v_state = VDEV_STATE_CANT_OPEN;
620 }
621
622 rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
623 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
624 /*
625 * Its ok if we don't have any kids.
626 */
627 if (rc == 0) {
628 vdev->v_nchildren = nkids;
629 for (i = 0; i < nkids; i++) {
630 rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
631 if (rc)
632 return (rc);
633 if (is_new)
634 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
635 v_childlink);
636 kids = nvlist_next(kids);
637 }
638 } else {
639 vdev->v_nchildren = 0;
640 }
641
642 if (vdevp)
643 *vdevp = vdev;
644 return (0);
645 }
646
647 static void
648 vdev_set_state(vdev_t *vdev)
649 {
650 vdev_t *kid;
651 int good_kids;
652 int bad_kids;
653
654 /*
655 * A mirror or raidz is healthy if all its kids are healthy. A
656 * mirror is degraded if any of its kids is healthy; a raidz
657 * is degraded if at most nparity kids are offline.
658 */
659 if (STAILQ_FIRST(&vdev->v_children)) {
660 good_kids = 0;
661 bad_kids = 0;
662 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
663 if (kid->v_state == VDEV_STATE_HEALTHY)
664 good_kids++;
665 else
666 bad_kids++;
667 }
668 if (bad_kids == 0) {
669 vdev->v_state = VDEV_STATE_HEALTHY;
670 } else {
671 if (vdev->v_read == vdev_mirror_read) {
672 if (good_kids) {
673 vdev->v_state = VDEV_STATE_DEGRADED;
674 } else {
675 vdev->v_state = VDEV_STATE_OFFLINE;
676 }
677 } else if (vdev->v_read == vdev_raidz_read) {
678 if (bad_kids > vdev->v_nparity) {
679 vdev->v_state = VDEV_STATE_OFFLINE;
680 } else {
681 vdev->v_state = VDEV_STATE_DEGRADED;
682 }
683 }
684 }
685 }
686 }
687
688 static spa_t *
689 spa_find_by_guid(uint64_t guid)
690 {
691 spa_t *spa;
692
693 STAILQ_FOREACH(spa, &zfs_pools, spa_link)
694 if (spa->spa_guid == guid)
695 return (spa);
696
697 return (0);
698 }
699
700 static spa_t *
701 spa_find_by_name(const char *name)
702 {
703 spa_t *spa;
704
705 STAILQ_FOREACH(spa, &zfs_pools, spa_link)
706 if (!strcmp(spa->spa_name, name))
707 return (spa);
708
709 return (0);
710 }
711
712 #ifdef BOOT2
713 static spa_t *
714 spa_get_primary(void)
715 {
716
717 return (STAILQ_FIRST(&zfs_pools));
718 }
719
720 static vdev_t *
721 spa_get_primary_vdev(const spa_t *spa)
722 {
723 vdev_t *vdev;
724 vdev_t *kid;
725
726 if (spa == NULL)
727 spa = spa_get_primary();
728 if (spa == NULL)
729 return (NULL);
730 vdev = STAILQ_FIRST(&spa->spa_vdevs);
731 if (vdev == NULL)
732 return (NULL);
733 for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
734 kid = STAILQ_FIRST(&vdev->v_children))
735 vdev = kid;
736 return (vdev);
737 }
738 #endif
739
740 static spa_t *
741 spa_create(uint64_t guid)
742 {
743 spa_t *spa;
744
745 spa = malloc(sizeof(spa_t));
746 memset(spa, 0, sizeof(spa_t));
747 STAILQ_INIT(&spa->spa_vdevs);
748 spa->spa_guid = guid;
749 STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
750
751 return (spa);
752 }
753
754 static const char *
755 state_name(vdev_state_t state)
756 {
757 static const char* names[] = {
758 "UNKNOWN",
759 "CLOSED",
760 "OFFLINE",
761 "REMOVED",
762 "CANT_OPEN",
763 "FAULTED",
764 "DEGRADED",
765 "ONLINE"
766 };
767 return names[state];
768 }
769
770 #ifdef BOOT2
771
772 #define pager_printf printf
773
774 #else
775
/*
 * printf()-style front end for pager_output(): format the arguments into
 * a line buffer and hand the result to the pager.
 *
 * Output longer than the line buffer is truncated.  The formatting is
 * bounded with vsnprintf() because callers such as print_state() can
 * supply strings far longer than 80 bytes, which an unbounded vsprintf()
 * would write past the end of `line'.
 */
static void
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	pager_output(line);
}
787
788 #endif
789
790 #define STATUS_FORMAT " %s %s\n"
791
792 static void
793 print_state(int indent, const char *name, vdev_state_t state)
794 {
795 int i;
796 char buf[512];
797
798 buf[0] = 0;
799 for (i = 0; i < indent; i++)
800 strcat(buf, " ");
801 strcat(buf, name);
802 pager_printf(STATUS_FORMAT, buf, state_name(state));
803
804 }
805
806 static void
807 vdev_status(vdev_t *vdev, int indent)
808 {
809 vdev_t *kid;
810 print_state(indent, vdev->v_name, vdev->v_state);
811
812 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
813 vdev_status(kid, indent + 1);
814 }
815 }
816
817 static void
818 spa_status(spa_t *spa)
819 {
820 static char bootfs[ZFS_MAXNAMELEN];
821 uint64_t rootid;
822 vdev_t *vdev;
823 int good_kids, bad_kids, degraded_kids;
824 vdev_state_t state;
825
826 pager_printf(" pool: %s\n", spa->spa_name);
827 if (zfs_get_root(spa, &rootid) == 0 &&
828 zfs_rlookup(spa, rootid, bootfs) == 0) {
829 if (bootfs[0] == '\0')
830 pager_printf("bootfs: %s\n", spa->spa_name);
831 else
832 pager_printf("bootfs: %s/%s\n", spa->spa_name, bootfs);
833 }
834 pager_printf("config:\n\n");
835 pager_printf(STATUS_FORMAT, "NAME", "STATE");
836
837 good_kids = 0;
838 degraded_kids = 0;
839 bad_kids = 0;
840 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
841 if (vdev->v_state == VDEV_STATE_HEALTHY)
842 good_kids++;
843 else if (vdev->v_state == VDEV_STATE_DEGRADED)
844 degraded_kids++;
845 else
846 bad_kids++;
847 }
848
849 state = VDEV_STATE_CLOSED;
850 if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
851 state = VDEV_STATE_HEALTHY;
852 else if ((good_kids + degraded_kids) > 0)
853 state = VDEV_STATE_DEGRADED;
854
855 print_state(0, spa->spa_name, state);
856 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
857 vdev_status(vdev, 1);
858 }
859 }
860
861 static void
862 spa_all_status(void)
863 {
864 spa_t *spa;
865 int first = 1;
866
867 STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
868 if (!first)
869 pager_printf("\n");
870 first = 0;
871 spa_status(spa);
872 }
873 }
874
875 static int
876 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
877 {
878 vdev_t vtmp;
879 vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
880 spa_t *spa;
881 vdev_t *vdev, *top_vdev, *pool_vdev;
882 off_t off;
883 blkptr_t bp;
884 const unsigned char *nvlist;
885 uint64_t val;
886 uint64_t guid;
887 uint64_t pool_txg, pool_guid;
888 uint64_t is_log;
889 const char *pool_name;
890 const unsigned char *vdevs;
891 const unsigned char *features;
892 int i, rc, is_newer;
893 char *upbuf;
894 const struct uberblock *up;
895
896 /*
897 * Load the vdev label and figure out which
898 * uberblock is most current.
899 */
900 memset(&vtmp, 0, sizeof(vtmp));
901 vtmp.v_phys_read = read;
902 vtmp.v_read_priv = read_priv;
903 off = offsetof(vdev_label_t, vl_vdev_phys);
904 BP_ZERO(&bp);
905 BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
906 BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
907 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
908 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
909 DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
910 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
911 if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
912 return (EIO);
913
914 if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
915 return (EIO);
916 }
917
918 nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
919
920 if (nvlist_find(nvlist,
921 ZPOOL_CONFIG_VERSION,
922 DATA_TYPE_UINT64, 0, &val)) {
923 return (EIO);
924 }
925
926 if (!SPA_VERSION_IS_SUPPORTED(val)) {
927 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
928 (unsigned) val, (unsigned) SPA_VERSION);
929 return (EIO);
930 }
931
932 /* Check ZFS features for read */
933 if (nvlist_find(nvlist,
934 ZPOOL_CONFIG_FEATURES_FOR_READ,
935 DATA_TYPE_NVLIST, 0, &features) == 0
936 && nvlist_check_features_for_read(features) != 0)
937 return (EIO);
938
939 if (nvlist_find(nvlist,
940 ZPOOL_CONFIG_POOL_STATE,
941 DATA_TYPE_UINT64, 0, &val)) {
942 return (EIO);
943 }
944
945 if (val == POOL_STATE_DESTROYED) {
946 /* We don't boot only from destroyed pools. */
947 return (EIO);
948 }
949
950 if (nvlist_find(nvlist,
951 ZPOOL_CONFIG_POOL_TXG,
952 DATA_TYPE_UINT64, 0, &pool_txg)
953 || nvlist_find(nvlist,
954 ZPOOL_CONFIG_POOL_GUID,
955 DATA_TYPE_UINT64, 0, &pool_guid)
956 || nvlist_find(nvlist,
957 ZPOOL_CONFIG_POOL_NAME,
958 DATA_TYPE_STRING, 0, &pool_name)) {
959 /*
960 * Cache and spare devices end up here - just ignore
961 * them.
962 */
963 /*printf("ZFS: can't find pool details\n");*/
964 return (EIO);
965 }
966
967 is_log = 0;
968 (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
969 &is_log);
970 if (is_log)
971 return (EIO);
972
973 /*
974 * Create the pool if this is the first time we've seen it.
975 */
976 spa = spa_find_by_guid(pool_guid);
977 if (!spa) {
978 spa = spa_create(pool_guid);
979 spa->spa_name = strdup(pool_name);
980 }
981 if (pool_txg > spa->spa_txg) {
982 spa->spa_txg = pool_txg;
983 is_newer = 1;
984 } else
985 is_newer = 0;
986
987 /*
988 * Get the vdev tree and create our in-core copy of it.
989 * If we already have a vdev with this guid, this must
990 * be some kind of alias (overlapping slices, dangerously dedicated
991 * disks etc).
992 */
993 if (nvlist_find(nvlist,
994 ZPOOL_CONFIG_GUID,
995 DATA_TYPE_UINT64, 0, &guid)) {
996 return (EIO);
997 }
998 vdev = vdev_find(guid);
999 if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
1000 return (EIO);
1001
1002 if (nvlist_find(nvlist,
1003 ZPOOL_CONFIG_VDEV_TREE,
1004 DATA_TYPE_NVLIST, 0, &vdevs)) {
1005 return (EIO);
1006 }
1007
1008 rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1009 if (rc)
1010 return (rc);
1011
1012 /*
1013 * Add the toplevel vdev to the pool if its not already there.
1014 */
1015 STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1016 if (top_vdev == pool_vdev)
1017 break;
1018 if (!pool_vdev && top_vdev)
1019 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1020
1021 /*
1022 * We should already have created an incomplete vdev for this
1023 * vdev. Find it and initialise it with our read proc.
1024 */
1025 vdev = vdev_find(guid);
1026 if (vdev) {
1027 vdev->v_phys_read = read;
1028 vdev->v_read_priv = read_priv;
1029 vdev->v_state = VDEV_STATE_HEALTHY;
1030 } else {
1031 printf("ZFS: inconsistent nvlist contents\n");
1032 return (EIO);
1033 }
1034
1035 /*
1036 * Re-evaluate top-level vdev state.
1037 */
1038 vdev_set_state(top_vdev);
1039
1040 /*
1041 * Ok, we are happy with the pool so far. Lets find
1042 * the best uberblock and then we can actually access
1043 * the contents of the pool.
1044 */
1045 upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1046 up = (const struct uberblock *)upbuf;
1047 for (i = 0;
1048 i < VDEV_UBERBLOCK_COUNT(vdev);
1049 i++) {
1050 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
1051 BP_ZERO(&bp);
1052 DVA_SET_OFFSET(&bp.blk_dva[0], off);
1053 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1054 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1055 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1056 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1057 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1058
1059 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1060 continue;
1061
1062 if (up->ub_magic != UBERBLOCK_MAGIC)
1063 continue;
1064 if (up->ub_txg < spa->spa_txg)
1065 continue;
1066 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
1067 spa->spa_uberblock = *up;
1068 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
1069 if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
1070 spa->spa_uberblock = *up;
1071 }
1072 }
1073 zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1074
1075 if (spap)
1076 *spap = spa;
1077 return (0);
1078 }
1079
/*
 * Return the base-2 logarithm of `n' when n is an exact power of two
 * representable in 32 bits, and -1 otherwise (including n == 0).
 */
static int
ilog2(int n)
{
	int bit;

	/*
	 * Compare as unsigned: "1 << 31" overflows a signed int, whereas
	 * 1U << 31 is well defined.
	 */
	for (bit = 0; bit < 32; bit++)
		if ((unsigned int)n == (1U << bit))
			return (bit);
	return (-1);
}
1090
1091 static int
1092 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1093 {
1094 blkptr_t gbh_bp;
1095 zio_gbh_phys_t zio_gb;
1096 char *pbuf;
1097 int i;
1098
1099 /* Artificial BP for gang block header. */
1100 gbh_bp = *bp;
1101 BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1102 BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1103 BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1104 BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1105 for (i = 0; i < SPA_DVAS_PER_BP; i++)
1106 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1107
1108 /* Read gang header block using the artificial BP. */
1109 if (zio_read(spa, &gbh_bp, &zio_gb))
1110 return (EIO);
1111
1112 pbuf = buf;
1113 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1114 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1115
1116 if (BP_IS_HOLE(gbp))
1117 continue;
1118 if (zio_read(spa, gbp, pbuf))
1119 return (EIO);
1120 pbuf += BP_GET_PSIZE(gbp);
1121 }
1122
1123 if (zio_checksum_verify(bp, buf))
1124 return (EIO);
1125 return (0);
1126 }
1127
1128 static int
1129 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1130 {
1131 int cpfunc = BP_GET_COMPRESS(bp);
1132 uint64_t align, size;
1133 void *pbuf;
1134 int i, error;
1135
1136 error = EIO;
1137
1138 for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1139 const dva_t *dva = &bp->blk_dva[i];
1140 vdev_t *vdev;
1141 int vdevid;
1142 off_t offset;
1143
1144 if (!dva->dva_word[0] && !dva->dva_word[1])
1145 continue;
1146
1147 vdevid = DVA_GET_VDEV(dva);
1148 offset = DVA_GET_OFFSET(dva);
1149 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1150 if (vdev->v_id == vdevid)
1151 break;
1152 }
1153 if (!vdev || !vdev->v_read)
1154 continue;
1155
1156 size = BP_GET_PSIZE(bp);
1157 if (vdev->v_read == vdev_raidz_read) {
1158 align = 1ULL << vdev->v_top->v_ashift;
1159 if (P2PHASE(size, align) != 0)
1160 size = P2ROUNDUP(size, align);
1161 }
1162 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1163 pbuf = zfs_alloc(size);
1164 else
1165 pbuf = buf;
1166
1167 if (DVA_GET_GANG(dva))
1168 error = zio_read_gang(spa, bp, pbuf);
1169 else
1170 error = vdev->v_read(vdev, bp, pbuf, offset, size);
1171 if (error == 0) {
1172 if (cpfunc != ZIO_COMPRESS_OFF)
1173 error = zio_decompress_data(cpfunc, pbuf,
1174 BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1175 else if (size != BP_GET_PSIZE(bp))
1176 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1177 }
1178 if (buf != pbuf)
1179 zfs_free(pbuf, size);
1180 if (error == 0)
1181 break;
1182 }
1183 if (error != 0)
1184 printf("ZFS: i/o error - all block copies unavailable\n");
1185 return (error);
1186 }
1187
1188 static int
1189 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1190 {
1191 int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1192 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1193 int nlevels = dnode->dn_nlevels;
1194 int i, rc;
1195
1196 /*
1197 * Note: bsize may not be a power of two here so we need to do an
1198 * actual divide rather than a bitshift.
1199 */
1200 while (buflen > 0) {
1201 uint64_t bn = offset / bsize;
1202 int boff = offset % bsize;
1203 int ibn;
1204 const blkptr_t *indbp;
1205 blkptr_t bp;
1206
1207 if (bn > dnode->dn_maxblkid)
1208 return (EIO);
1209
1210 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1211 goto cached;
1212
1213 indbp = dnode->dn_blkptr;
1214 for (i = 0; i < nlevels; i++) {
1215 /*
1216 * Copy the bp from the indirect array so that
1217 * we can re-use the scratch buffer for multi-level
1218 * objects.
1219 */
1220 ibn = bn >> ((nlevels - i - 1) * ibshift);
1221 ibn &= ((1 << ibshift) - 1);
1222 bp = indbp[ibn];
1223 rc = zio_read(spa, &bp, dnode_cache_buf);
1224 if (rc)
1225 return (rc);
1226 indbp = (const blkptr_t *) dnode_cache_buf;
1227 }
1228 dnode_cache_obj = dnode;
1229 dnode_cache_bn = bn;
1230 cached:
1231
1232 /*
1233 * The buffer contains our data block. Copy what we
1234 * need from it and loop.
1235 */
1236 i = bsize - boff;
1237 if (i > buflen) i = buflen;
1238 memcpy(buf, &dnode_cache_buf[boff], i);
1239 buf = ((char*) buf) + i;
1240 offset += i;
1241 buflen -= i;
1242 }
1243
1244 return (0);
1245 }
1246
1247 /*
1248 * Lookup a value in a microzap directory. Assumes that the zap
1249 * scratch buffer contains the directory contents.
1250 */
1251 static int
1252 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1253 {
1254 const mzap_phys_t *mz;
1255 const mzap_ent_phys_t *mze;
1256 size_t size;
1257 int chunks, i;
1258
1259 /*
1260 * Microzap objects use exactly one block. Read the whole
1261 * thing.
1262 */
1263 size = dnode->dn_datablkszsec * 512;
1264
1265 mz = (const mzap_phys_t *) zap_scratch;
1266 chunks = size / MZAP_ENT_LEN - 1;
1267
1268 for (i = 0; i < chunks; i++) {
1269 mze = &mz->mz_chunk[i];
1270 if (!strcmp(mze->mze_name, name)) {
1271 *value = mze->mze_value;
1272 return (0);
1273 }
1274 }
1275
1276 return (ENOENT);
1277 }
1278
1279 /*
1280 * Compare a name with a zap leaf entry. Return non-zero if the name
1281 * matches.
1282 */
1283 static int
1284 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1285 {
1286 size_t namelen;
1287 const zap_leaf_chunk_t *nc;
1288 const char *p;
1289
1290 namelen = zc->l_entry.le_name_numints;
1291
1292 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1293 p = name;
1294 while (namelen > 0) {
1295 size_t len;
1296 len = namelen;
1297 if (len > ZAP_LEAF_ARRAY_BYTES)
1298 len = ZAP_LEAF_ARRAY_BYTES;
1299 if (memcmp(p, nc->l_array.la_array, len))
1300 return (0);
1301 p += len;
1302 namelen -= len;
1303 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1304 }
1305
1306 return 1;
1307 }
1308
1309 /*
1310 * Extract a uint64_t value from a zap leaf entry.
1311 */
1312 static uint64_t
1313 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1314 {
1315 const zap_leaf_chunk_t *vc;
1316 int i;
1317 uint64_t value;
1318 const uint8_t *p;
1319
1320 vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1321 for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1322 value = (value << 8) | p[i];
1323 }
1324
1325 return value;
1326 }
1327
1328 /*
1329 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1330 * buffer contains the directory header.
1331 */
1332 static int
1333 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1334 {
1335 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1336 zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1337 fat_zap_t z;
1338 uint64_t *ptrtbl;
1339 uint64_t hash;
1340 int rc;
1341
1342 if (zh.zap_magic != ZAP_MAGIC)
1343 return (EIO);
1344
1345 z.zap_block_shift = ilog2(bsize);
1346 z.zap_phys = (zap_phys_t *) zap_scratch;
1347
1348 /*
1349 * Figure out where the pointer table is and read it in if necessary.
1350 */
1351 if (zh.zap_ptrtbl.zt_blk) {
1352 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1353 zap_scratch, bsize);
1354 if (rc)
1355 return (rc);
1356 ptrtbl = (uint64_t *) zap_scratch;
1357 } else {
1358 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1359 }
1360
1361 hash = zap_hash(zh.zap_salt, name);
1362
1363 zap_leaf_t zl;
1364 zl.l_bs = z.zap_block_shift;
1365
1366 off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1367 zap_leaf_chunk_t *zc;
1368
1369 rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1370 if (rc)
1371 return (rc);
1372
1373 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1374
1375 /*
1376 * Make sure this chunk matches our hash.
1377 */
1378 if (zl.l_phys->l_hdr.lh_prefix_len > 0
1379 && zl.l_phys->l_hdr.lh_prefix
1380 != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1381 return (ENOENT);
1382
1383 /*
1384 * Hash within the chunk to find our entry.
1385 */
1386 int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1387 int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1388 h = zl.l_phys->l_hash[h];
1389 if (h == 0xffff)
1390 return (ENOENT);
1391 zc = &ZAP_LEAF_CHUNK(&zl, h);
1392 while (zc->l_entry.le_hash != hash) {
1393 if (zc->l_entry.le_next == 0xffff) {
1394 zc = 0;
1395 break;
1396 }
1397 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1398 }
1399 if (fzap_name_equal(&zl, zc, name)) {
1400 if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints > 8)
1401 return (E2BIG);
1402 *value = fzap_leaf_value(&zl, zc);
1403 return (0);
1404 }
1405
1406 return (ENOENT);
1407 }
1408
1409 /*
1410 * Lookup a name in a zap object and return its value as a uint64_t.
1411 */
1412 static int
1413 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1414 {
1415 int rc;
1416 uint64_t zap_type;
1417 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1418
1419 rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1420 if (rc)
1421 return (rc);
1422
1423 zap_type = *(uint64_t *) zap_scratch;
1424 if (zap_type == ZBT_MICRO)
1425 return mzap_lookup(dnode, name, value);
1426 else if (zap_type == ZBT_HEADER)
1427 return fzap_lookup(spa, dnode, name, value);
1428 printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1429 return (EIO);
1430 }
1431
1432 /*
1433 * List a microzap directory. Assumes that the zap scratch buffer contains
1434 * the directory contents.
1435 */
1436 static int
1437 mzap_list(const dnode_phys_t *dnode)
1438 {
1439 const mzap_phys_t *mz;
1440 const mzap_ent_phys_t *mze;
1441 size_t size;
1442 int chunks, i;
1443
1444 /*
1445 * Microzap objects use exactly one block. Read the whole
1446 * thing.
1447 */
1448 size = dnode->dn_datablkszsec * 512;
1449 mz = (const mzap_phys_t *) zap_scratch;
1450 chunks = size / MZAP_ENT_LEN - 1;
1451
1452 for (i = 0; i < chunks; i++) {
1453 mze = &mz->mz_chunk[i];
1454 if (mze->mze_name[0])
1455 //printf("%-32s 0x%jx\n", mze->mze_name, (uintmax_t)mze->mze_value);
1456 printf("%s\n", mze->mze_name);
1457 }
1458
1459 return (0);
1460 }
1461
1462 /*
1463 * List a fatzap directory. Assumes that the zap scratch buffer contains
1464 * the directory header.
1465 */
1466 static int
1467 fzap_list(const spa_t *spa, const dnode_phys_t *dnode)
1468 {
1469 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1470 zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1471 fat_zap_t z;
1472 int i, j;
1473
1474 if (zh.zap_magic != ZAP_MAGIC)
1475 return (EIO);
1476
1477 z.zap_block_shift = ilog2(bsize);
1478 z.zap_phys = (zap_phys_t *) zap_scratch;
1479
1480 /*
1481 * This assumes that the leaf blocks start at block 1. The
1482 * documentation isn't exactly clear on this.
1483 */
1484 zap_leaf_t zl;
1485 zl.l_bs = z.zap_block_shift;
1486 for (i = 0; i < zh.zap_num_leafs; i++) {
1487 off_t off = (i + 1) << zl.l_bs;
1488 char name[256], *p;
1489 uint64_t value;
1490
1491 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1492 return (EIO);
1493
1494 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1495
1496 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1497 zap_leaf_chunk_t *zc, *nc;
1498 int namelen;
1499
1500 zc = &ZAP_LEAF_CHUNK(&zl, j);
1501 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1502 continue;
1503 namelen = zc->l_entry.le_name_numints;
1504 if (namelen > sizeof(name))
1505 namelen = sizeof(name);
1506
1507 /*
1508 * Paste the name back together.
1509 */
1510 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1511 p = name;
1512 while (namelen > 0) {
1513 int len;
1514 len = namelen;
1515 if (len > ZAP_LEAF_ARRAY_BYTES)
1516 len = ZAP_LEAF_ARRAY_BYTES;
1517 memcpy(p, nc->l_array.la_array, len);
1518 p += len;
1519 namelen -= len;
1520 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1521 }
1522
1523 /*
1524 * Assume the first eight bytes of the value are
1525 * a uint64_t.
1526 */
1527 value = fzap_leaf_value(&zl, zc);
1528
1529 //printf("%s 0x%jx\n", name, (uintmax_t)value);
1530 printf("%s\n", name);
1531 }
1532 }
1533
1534 return (0);
1535 }
1536
1537 /*
1538 * List a zap directory.
1539 */
1540 static int
1541 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1542 {
1543 uint64_t zap_type;
1544 size_t size = dnode->dn_datablkszsec * 512;
1545
1546 if (dnode_read(spa, dnode, 0, zap_scratch, size))
1547 return (EIO);
1548
1549 zap_type = *(uint64_t *) zap_scratch;
1550 if (zap_type == ZBT_MICRO)
1551 return mzap_list(dnode);
1552 else
1553 return fzap_list(spa, dnode);
1554 }
1555
1556 static int
1557 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1558 {
1559 off_t offset;
1560
1561 offset = objnum * sizeof(dnode_phys_t);
1562 return dnode_read(spa, &os->os_meta_dnode, offset,
1563 dnode, sizeof(dnode_phys_t));
1564 }
1565
1566 static int
1567 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1568 {
1569 const mzap_phys_t *mz;
1570 const mzap_ent_phys_t *mze;
1571 size_t size;
1572 int chunks, i;
1573
1574 /*
1575 * Microzap objects use exactly one block. Read the whole
1576 * thing.
1577 */
1578 size = dnode->dn_datablkszsec * 512;
1579
1580 mz = (const mzap_phys_t *) zap_scratch;
1581 chunks = size / MZAP_ENT_LEN - 1;
1582
1583 for (i = 0; i < chunks; i++) {
1584 mze = &mz->mz_chunk[i];
1585 if (value == mze->mze_value) {
1586 strcpy(name, mze->mze_name);
1587 return (0);
1588 }
1589 }
1590
1591 return (ENOENT);
1592 }
1593
1594 static void
1595 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1596 {
1597 size_t namelen;
1598 const zap_leaf_chunk_t *nc;
1599 char *p;
1600
1601 namelen = zc->l_entry.le_name_numints;
1602
1603 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1604 p = name;
1605 while (namelen > 0) {
1606 size_t len;
1607 len = namelen;
1608 if (len > ZAP_LEAF_ARRAY_BYTES)
1609 len = ZAP_LEAF_ARRAY_BYTES;
1610 memcpy(p, nc->l_array.la_array, len);
1611 p += len;
1612 namelen -= len;
1613 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1614 }
1615
1616 *p = '\0';
1617 }
1618
1619 static int
1620 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1621 {
1622 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1623 zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1624 fat_zap_t z;
1625 int i, j;
1626
1627 if (zh.zap_magic != ZAP_MAGIC)
1628 return (EIO);
1629
1630 z.zap_block_shift = ilog2(bsize);
1631 z.zap_phys = (zap_phys_t *) zap_scratch;
1632
1633 /*
1634 * This assumes that the leaf blocks start at block 1. The
1635 * documentation isn't exactly clear on this.
1636 */
1637 zap_leaf_t zl;
1638 zl.l_bs = z.zap_block_shift;
1639 for (i = 0; i < zh.zap_num_leafs; i++) {
1640 off_t off = (i + 1) << zl.l_bs;
1641
1642 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1643 return (EIO);
1644
1645 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1646
1647 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1648 zap_leaf_chunk_t *zc;
1649
1650 zc = &ZAP_LEAF_CHUNK(&zl, j);
1651 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1652 continue;
1653 if (zc->l_entry.le_value_intlen != 8 ||
1654 zc->l_entry.le_value_numints != 1)
1655 continue;
1656
1657 if (fzap_leaf_value(&zl, zc) == value) {
1658 fzap_name_copy(&zl, zc, name);
1659 return (0);
1660 }
1661 }
1662 }
1663
1664 return (ENOENT);
1665 }
1666
1667 static int
1668 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1669 {
1670 int rc;
1671 uint64_t zap_type;
1672 size_t size = dnode->dn_datablkszsec * 512;
1673
1674 rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1675 if (rc)
1676 return (rc);
1677
1678 zap_type = *(uint64_t *) zap_scratch;
1679 if (zap_type == ZBT_MICRO)
1680 return mzap_rlookup(spa, dnode, name, value);
1681 else
1682 return fzap_rlookup(spa, dnode, name, value);
1683 }
1684
1685 static int
1686 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1687 {
1688 char name[256];
1689 char component[256];
1690 uint64_t dir_obj, parent_obj, child_dir_zapobj;
1691 dnode_phys_t child_dir_zap, dataset, dir, parent;
1692 dsl_dir_phys_t *dd;
1693 dsl_dataset_phys_t *ds;
1694 char *p;
1695 int len;
1696
1697 p = &name[sizeof(name) - 1];
1698 *p = '\0';
1699
1700 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1701 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1702 return (EIO);
1703 }
1704 ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1705 dir_obj = ds->ds_dir_obj;
1706
1707 for (;;) {
1708 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1709 return (EIO);
1710 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1711
1712 /* Actual loop condition. */
1713 parent_obj = dd->dd_parent_obj;
1714 if (parent_obj == 0)
1715 break;
1716
1717 if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1718 return (EIO);
1719 dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1720 child_dir_zapobj = dd->dd_child_dir_zapobj;
1721 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1722 return (EIO);
1723 if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1724 return (EIO);
1725
1726 len = strlen(component);
1727 p -= len;
1728 memcpy(p, component, len);
1729 --p;
1730 *p = '/';
1731
1732 /* Actual loop iteration. */
1733 dir_obj = parent_obj;
1734 }
1735
1736 if (*p != '\0')
1737 ++p;
1738 strcpy(result, p);
1739
1740 return (0);
1741 }
1742
1743 static int
1744 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1745 {
1746 char element[256];
1747 uint64_t dir_obj, child_dir_zapobj;
1748 dnode_phys_t child_dir_zap, dir;
1749 dsl_dir_phys_t *dd;
1750 const char *p, *q;
1751
1752 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1753 return (EIO);
1754 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &dir_obj))
1755 return (EIO);
1756
1757 p = name;
1758 for (;;) {
1759 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1760 return (EIO);
1761 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1762
1763 while (*p == '/')
1764 p++;
1765 /* Actual loop condition #1. */
1766 if (*p == '\0')
1767 break;
1768
1769 q = strchr(p, '/');
1770 if (q) {
1771 memcpy(element, p, q - p);
1772 element[q - p] = '\0';
1773 p = q + 1;
1774 } else {
1775 strcpy(element, p);
1776 p += strlen(p);
1777 }
1778
1779 child_dir_zapobj = dd->dd_child_dir_zapobj;
1780 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1781 return (EIO);
1782
1783 /* Actual loop condition #2. */
1784 if (zap_lookup(spa, &child_dir_zap, element, &dir_obj) != 0)
1785 return (ENOENT);
1786 }
1787
1788 *objnum = dd->dd_head_dataset_obj;
1789 return (0);
1790 }
1791
1792 #ifndef BOOT2
1793 static int
1794 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1795 {
1796 uint64_t dir_obj, child_dir_zapobj;
1797 dnode_phys_t child_dir_zap, dir, dataset;
1798 dsl_dataset_phys_t *ds;
1799 dsl_dir_phys_t *dd;
1800
1801 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1802 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1803 return (EIO);
1804 }
1805 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1806 dir_obj = ds->ds_dir_obj;
1807
1808 if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
1809 printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
1810 return (EIO);
1811 }
1812 dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1813
1814 child_dir_zapobj = dd->dd_child_dir_zapobj;
1815 if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
1816 printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
1817 return (EIO);
1818 }
1819
1820 return (zap_list(spa, &child_dir_zap) != 0);
1821 }
1822 #endif
1823
1824 /*
1825 * Find the object set given the object number of its dataset object
1826 * and return its details in *objset
1827 */
1828 static int
1829 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1830 {
1831 dnode_phys_t dataset;
1832 dsl_dataset_phys_t *ds;
1833
1834 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1835 printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1836 return (EIO);
1837 }
1838
1839 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1840 if (zio_read(spa, &ds->ds_bp, objset)) {
1841 printf("ZFS: can't read object set for dataset %ju\n",
1842 (uintmax_t)objnum);
1843 return (EIO);
1844 }
1845
1846 return (0);
1847 }
1848
1849 /*
1850 * Find the object set pointed to by the BOOTFS property or the root
1851 * dataset if there is none and return its details in *objset
1852 */
1853 static int
1854 zfs_get_root(const spa_t *spa, uint64_t *objid)
1855 {
1856 dnode_phys_t dir, propdir;
1857 uint64_t props, bootfs, root;
1858
1859 *objid = 0;
1860
1861 /*
1862 * Start with the MOS directory object.
1863 */
1864 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1865 printf("ZFS: can't read MOS object directory\n");
1866 return (EIO);
1867 }
1868
1869 /*
1870 * Lookup the pool_props and see if we can find a bootfs.
1871 */
1872 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1873 && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1874 && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1875 && bootfs != 0)
1876 {
1877 *objid = bootfs;
1878 return (0);
1879 }
1880 /*
1881 * Lookup the root dataset directory
1882 */
1883 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1884 || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1885 printf("ZFS: can't find root dsl_dir\n");
1886 return (EIO);
1887 }
1888
1889 /*
1890 * Use the information from the dataset directory's bonus buffer
1891 * to find the dataset object and from that the object set itself.
1892 */
1893 dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1894 *objid = dd->dd_head_dataset_obj;
1895 return (0);
1896 }
1897
1898 static int
1899 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
1900 {
1901
1902 mount->spa = spa;
1903
1904 /*
1905 * Find the root object set if not explicitly provided
1906 */
1907 if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
1908 printf("ZFS: can't find root filesystem\n");
1909 return (EIO);
1910 }
1911
1912 if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
1913 printf("ZFS: can't open root filesystem\n");
1914 return (EIO);
1915 }
1916
1917 mount->rootobj = rootobj;
1918
1919 return (0);
1920 }
1921
1922 static int
1923 zfs_spa_init(spa_t *spa)
1924 {
1925
1926 if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1927 printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
1928 return (EIO);
1929 }
1930 if (spa->spa_mos.os_type != DMU_OST_META) {
1931 printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
1932 return (EIO);
1933 }
1934 return (0);
1935 }
1936
1937 static int
1938 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
1939 {
1940
1941 if (dn->dn_bonustype != DMU_OT_SA) {
1942 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
1943
1944 sb->st_mode = zp->zp_mode;
1945 sb->st_uid = zp->zp_uid;
1946 sb->st_gid = zp->zp_gid;
1947 sb->st_size = zp->zp_size;
1948 } else {
1949 sa_hdr_phys_t *sahdrp;
1950 int hdrsize;
1951 size_t size = 0;
1952 void *buf = NULL;
1953
1954 if (dn->dn_bonuslen != 0)
1955 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
1956 else {
1957 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
1958 blkptr_t *bp = &dn->dn_spill;
1959 int error;
1960
1961 size = BP_GET_LSIZE(bp);
1962 buf = zfs_alloc(size);
1963 error = zio_read(spa, bp, buf);
1964 if (error != 0) {
1965 zfs_free(buf, size);
1966 return (error);
1967 }
1968 sahdrp = buf;
1969 } else {
1970 return (EIO);
1971 }
1972 }
1973 hdrsize = SA_HDR_SIZE(sahdrp);
1974 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
1975 SA_MODE_OFFSET);
1976 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
1977 SA_UID_OFFSET);
1978 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
1979 SA_GID_OFFSET);
1980 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
1981 SA_SIZE_OFFSET);
1982 if (buf != NULL)
1983 zfs_free(buf, size);
1984 }
1985
1986 return (0);
1987 }
1988
1989 /*
1990 * Lookup a file and return its dnode.
1991 */
1992 static int
1993 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
1994 {
1995 int rc;
1996 uint64_t objnum, rootnum, parentnum;
1997 const spa_t *spa;
1998 dnode_phys_t dn;
1999 const char *p, *q;
2000 char element[256];
2001 char path[1024];
2002 int symlinks_followed = 0;
2003 struct stat sb;
2004
2005 spa = mount->spa;
2006 if (mount->objset.os_type != DMU_OST_ZFS) {
2007 printf("ZFS: unexpected object set type %ju\n",
2008 (uintmax_t)mount->objset.os_type);
2009 return (EIO);
2010 }
2011
2012 /*
2013 * Get the root directory dnode.
2014 */
2015 rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2016 if (rc)
2017 return (rc);
2018
2019 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
2020 if (rc)
2021 return (rc);
2022
2023 rc = objset_get_dnode(spa, &mount->objset, rootnum, &dn);
2024 if (rc)
2025 return (rc);
2026
2027 objnum = rootnum;
2028 p = upath;
2029 while (p && *p) {
2030 while (*p == '/')
2031 p++;
2032 if (!*p)
2033 break;
2034 q = strchr(p, '/');
2035 if (q) {
2036 memcpy(element, p, q - p);
2037 element[q - p] = 0;
2038 p = q;
2039 } else {
2040 strcpy(element, p);
2041 p = 0;
2042 }
2043
2044 rc = zfs_dnode_stat(spa, &dn, &sb);
2045 if (rc)
2046 return (rc);
2047 if (!S_ISDIR(sb.st_mode))
2048 return (ENOTDIR);
2049
2050 parentnum = objnum;
2051 rc = zap_lookup(spa, &dn, element, &objnum);
2052 if (rc)
2053 return (rc);
2054 objnum = ZFS_DIRENT_OBJ(objnum);
2055
2056 rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2057 if (rc)
2058 return (rc);
2059
2060 /*
2061 * Check for symlink.
2062 */
2063 rc = zfs_dnode_stat(spa, &dn, &sb);
2064 if (rc)
2065 return (rc);
2066 if (S_ISLNK(sb.st_mode)) {
2067 if (symlinks_followed > 10)
2068 return (EMLINK);
2069 symlinks_followed++;
2070
2071 /*
2072 * Read the link value and copy the tail of our
2073 * current path onto the end.
2074 */
2075 if (p)
2076 strcpy(&path[sb.st_size], p);
2077 else
2078 path[sb.st_size] = 0;
2079 if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
2080 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
2081 sb.st_size);
2082 } else {
2083 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
2084 if (rc)
2085 return (rc);
2086 }
2087
2088 /*
2089 * Restart with the new path, starting either at
2090 * the root or at the parent depending whether or
2091 * not the link is relative.
2092 */
2093 p = path;
2094 if (*p == '/')
2095 objnum = rootnum;
2096 else
2097 objnum = parentnum;
2098 objset_get_dnode(spa, &mount->objset, objnum, &dn);
2099 }
2100 }
2101
2102 *dnode = dn;
2103 return (0);
2104 }
Cache object: c3f748475327fa16c426e8a45caff998
|