FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c
1 /*-
2 * Copyright (c) 2007 Doug Rabson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/8.3/sys/boot/zfs/zfsimpl.c 230164 2012-01-15 21:50:17Z avg $");
29
30 /*
31 * Stand-alone ZFS file reader.
32 */
33
34 #include <sys/stat.h>
35
36 #include "zfsimpl.h"
37 #include "zfssubr.c"
38
39 /*
40 * List of all vdevs, chained through v_alllink.
41 */
42 static vdev_list_t zfs_vdevs;
43
44 /*
45 * List of all pools, chained through spa_link.
46 */
47 static spa_list_t zfs_pools;
48
49 static uint64_t zfs_crc64_table[256];
50 static const dnode_phys_t *dnode_cache_obj = 0;
51 static uint64_t dnode_cache_bn;
52 static char *dnode_cache_buf;
53 static char *zap_scratch;
54 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
55
56 #define TEMP_SIZE (1024 * 1024)
57
58 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
59
60 static void
61 zfs_init(void)
62 {
63 STAILQ_INIT(&zfs_vdevs);
64 STAILQ_INIT(&zfs_pools);
65
66 zfs_temp_buf = malloc(TEMP_SIZE);
67 zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
68 zfs_temp_ptr = zfs_temp_buf;
69 dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
70 zap_scratch = malloc(SPA_MAXBLOCKSIZE);
71
72 zfs_init_crc();
73 }
74
75 static void *
76 zfs_alloc(size_t size)
77 {
78 char *ptr;
79
80 if (zfs_temp_ptr + size > zfs_temp_end) {
81 printf("ZFS: out of temporary buffer space\n");
82 for (;;) ;
83 }
84 ptr = zfs_temp_ptr;
85 zfs_temp_ptr += size;
86
87 return (ptr);
88 }
89
90 static void
91 zfs_free(void *ptr, size_t size)
92 {
93
94 zfs_temp_ptr -= size;
95 if (zfs_temp_ptr != ptr) {
96 printf("ZFS: zfs_alloc()/zfs_free() mismatch\n");
97 for (;;) ;
98 }
99 }
100
/*
 * Decode one big-endian 32-bit integer from the XDR stream at *xdr,
 * store it in *ip and advance *xdr by four bytes.  Always returns 0.
 *
 * The bytes are combined as uint32_t: shifting a promoted (signed) int
 * left by 24 is undefined once the top byte is 0x80 or larger.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	v = ((uint32_t)p[0] << 24) |
	    ((uint32_t)p[1] << 16) |
	    ((uint32_t)p[2] << 8) |
	    (uint32_t)p[3];
	*ip = (int)v;
	*xdr = p + 4;
	return (0);
}
111
112 static int
113 xdr_u_int(const unsigned char **xdr, u_int *ip)
114 {
115 *ip = ((*xdr)[0] << 24)
116 | ((*xdr)[1] << 16)
117 | ((*xdr)[2] << 8)
118 | ((*xdr)[3] << 0);
119 (*xdr) += 4;
120 return (0);
121 }
122
123 static int
124 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
125 {
126 u_int hi, lo;
127
128 xdr_u_int(xdr, &hi);
129 xdr_u_int(xdr, &lo);
130 *lp = (((uint64_t) hi) << 32) | lo;
131 return (0);
132 }
133
134 static int
135 nvlist_find(const unsigned char *nvlist, const char *name, int type,
136 int* elementsp, void *valuep)
137 {
138 const unsigned char *p, *pair;
139 int junk;
140 int encoded_size, decoded_size;
141
142 p = nvlist;
143 xdr_int(&p, &junk);
144 xdr_int(&p, &junk);
145
146 pair = p;
147 xdr_int(&p, &encoded_size);
148 xdr_int(&p, &decoded_size);
149 while (encoded_size && decoded_size) {
150 int namelen, pairtype, elements;
151 const char *pairname;
152
153 xdr_int(&p, &namelen);
154 pairname = (const char*) p;
155 p += roundup(namelen, 4);
156 xdr_int(&p, &pairtype);
157
158 if (!memcmp(name, pairname, namelen) && type == pairtype) {
159 xdr_int(&p, &elements);
160 if (elementsp)
161 *elementsp = elements;
162 if (type == DATA_TYPE_UINT64) {
163 xdr_uint64_t(&p, (uint64_t *) valuep);
164 return (0);
165 } else if (type == DATA_TYPE_STRING) {
166 int len;
167 xdr_int(&p, &len);
168 (*(const char**) valuep) = (const char*) p;
169 return (0);
170 } else if (type == DATA_TYPE_NVLIST
171 || type == DATA_TYPE_NVLIST_ARRAY) {
172 (*(const unsigned char**) valuep) =
173 (const unsigned char*) p;
174 return (0);
175 } else {
176 return (EIO);
177 }
178 } else {
179 /*
180 * Not the pair we are looking for, skip to the next one.
181 */
182 p = pair + encoded_size;
183 }
184
185 pair = p;
186 xdr_int(&p, &encoded_size);
187 xdr_int(&p, &decoded_size);
188 }
189
190 return (EIO);
191 }
192
/*
 * Step over one encoded nvlist and return the address just past its
 * terminating (zero-sized) pair header, which is where the next nvlist of
 * an nvlist array starts.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int skipped;
	int enc_len, dec_len;

	/* Two leading words precede the first pair. */
	cursor = nvlist;
	xdr_int(&cursor, &skipped);
	xdr_int(&cursor, &skipped);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		/* Jump straight to the following pair. */
		cursor = pair_start + enc_len;
	}

	return (cursor);
}
220
221 #ifdef TEST
222
223 static const unsigned char *
224 nvlist_print(const unsigned char *nvlist, unsigned int indent)
225 {
226 static const char* typenames[] = {
227 "DATA_TYPE_UNKNOWN",
228 "DATA_TYPE_BOOLEAN",
229 "DATA_TYPE_BYTE",
230 "DATA_TYPE_INT16",
231 "DATA_TYPE_UINT16",
232 "DATA_TYPE_INT32",
233 "DATA_TYPE_UINT32",
234 "DATA_TYPE_INT64",
235 "DATA_TYPE_UINT64",
236 "DATA_TYPE_STRING",
237 "DATA_TYPE_BYTE_ARRAY",
238 "DATA_TYPE_INT16_ARRAY",
239 "DATA_TYPE_UINT16_ARRAY",
240 "DATA_TYPE_INT32_ARRAY",
241 "DATA_TYPE_UINT32_ARRAY",
242 "DATA_TYPE_INT64_ARRAY",
243 "DATA_TYPE_UINT64_ARRAY",
244 "DATA_TYPE_STRING_ARRAY",
245 "DATA_TYPE_HRTIME",
246 "DATA_TYPE_NVLIST",
247 "DATA_TYPE_NVLIST_ARRAY",
248 "DATA_TYPE_BOOLEAN_VALUE",
249 "DATA_TYPE_INT8",
250 "DATA_TYPE_UINT8",
251 "DATA_TYPE_BOOLEAN_ARRAY",
252 "DATA_TYPE_INT8_ARRAY",
253 "DATA_TYPE_UINT8_ARRAY"
254 };
255
256 unsigned int i, j;
257 const unsigned char *p, *pair;
258 int junk;
259 int encoded_size, decoded_size;
260
261 p = nvlist;
262 xdr_int(&p, &junk);
263 xdr_int(&p, &junk);
264
265 pair = p;
266 xdr_int(&p, &encoded_size);
267 xdr_int(&p, &decoded_size);
268 while (encoded_size && decoded_size) {
269 int namelen, pairtype, elements;
270 const char *pairname;
271
272 xdr_int(&p, &namelen);
273 pairname = (const char*) p;
274 p += roundup(namelen, 4);
275 xdr_int(&p, &pairtype);
276
277 for (i = 0; i < indent; i++)
278 printf(" ");
279 printf("%s %s", typenames[pairtype], pairname);
280
281 xdr_int(&p, &elements);
282 switch (pairtype) {
283 case DATA_TYPE_UINT64: {
284 uint64_t val;
285 xdr_uint64_t(&p, &val);
286 printf(" = 0x%llx\n", val);
287 break;
288 }
289
290 case DATA_TYPE_STRING: {
291 int len;
292 xdr_int(&p, &len);
293 printf(" = \"%s\"\n", p);
294 break;
295 }
296
297 case DATA_TYPE_NVLIST:
298 printf("\n");
299 nvlist_print(p, indent + 1);
300 break;
301
302 case DATA_TYPE_NVLIST_ARRAY:
303 for (j = 0; j < elements; j++) {
304 printf("[%d]\n", j);
305 p = nvlist_print(p, indent + 1);
306 if (j != elements - 1) {
307 for (i = 0; i < indent; i++)
308 printf(" ");
309 printf("%s %s", typenames[pairtype], pairname);
310 }
311 }
312 break;
313
314 default:
315 printf("\n");
316 }
317
318 p = pair + encoded_size;
319
320 pair = p;
321 xdr_int(&p, &encoded_size);
322 xdr_int(&p, &decoded_size);
323 }
324
325 return p;
326 }
327
328 #endif
329
330 static int
331 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
332 off_t offset, size_t size)
333 {
334 size_t psize;
335 int rc;
336
337 if (!vdev->v_phys_read)
338 return (EIO);
339
340 if (bp) {
341 psize = BP_GET_PSIZE(bp);
342 } else {
343 psize = size;
344 }
345
346 /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
347 rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
348 if (rc)
349 return (rc);
350 if (bp && zio_checksum_verify(bp, buf))
351 return (EIO);
352
353 return (0);
354 }
355
356 static int
357 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
358 off_t offset, size_t bytes)
359 {
360
361 return (vdev_read_phys(vdev, bp, buf,
362 offset + VDEV_LABEL_START_SIZE, bytes));
363 }
364
365
366 static int
367 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
368 off_t offset, size_t bytes)
369 {
370 vdev_t *kid;
371 int rc;
372
373 rc = EIO;
374 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
375 if (kid->v_state != VDEV_STATE_HEALTHY)
376 continue;
377 rc = kid->v_read(kid, bp, buf, offset, bytes);
378 if (!rc)
379 return (0);
380 }
381
382 return (rc);
383 }
384
385 static int
386 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
387 off_t offset, size_t bytes)
388 {
389 vdev_t *kid;
390
391 /*
392 * Here we should have two kids:
393 * First one which is the one we are replacing and we can trust
394 * only this one to have valid data, but it might not be present.
395 * Second one is that one we are replacing with. It is most likely
396 * healthy, but we can't trust it has needed data, so we won't use it.
397 */
398 kid = STAILQ_FIRST(&vdev->v_children);
399 if (kid == NULL)
400 return (EIO);
401 if (kid->v_state != VDEV_STATE_HEALTHY)
402 return (EIO);
403 return (kid->v_read(kid, bp, buf, offset, bytes));
404 }
405
406 static vdev_t *
407 vdev_find(uint64_t guid)
408 {
409 vdev_t *vdev;
410
411 STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
412 if (vdev->v_guid == guid)
413 return (vdev);
414
415 return (0);
416 }
417
418 static vdev_t *
419 vdev_create(uint64_t guid, vdev_read_t *read)
420 {
421 vdev_t *vdev;
422
423 vdev = malloc(sizeof(vdev_t));
424 memset(vdev, 0, sizeof(vdev_t));
425 STAILQ_INIT(&vdev->v_children);
426 vdev->v_guid = guid;
427 vdev->v_state = VDEV_STATE_OFFLINE;
428 vdev->v_read = read;
429 vdev->v_phys_read = 0;
430 vdev->v_read_priv = 0;
431 STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
432
433 return (vdev);
434 }
435
436 static int
437 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
438 vdev_t **vdevp, int is_newer)
439 {
440 int rc;
441 uint64_t guid, id, ashift, nparity;
442 const char *type;
443 const char *path;
444 vdev_t *vdev, *kid;
445 const unsigned char *kids;
446 int nkids, i, is_new;
447 uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
448
449 if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
450 DATA_TYPE_UINT64, 0, &guid)
451 || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
452 DATA_TYPE_UINT64, 0, &id)
453 || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
454 DATA_TYPE_STRING, 0, &type)) {
455 printf("ZFS: can't find vdev details\n");
456 return (ENOENT);
457 }
458
459 if (strcmp(type, VDEV_TYPE_MIRROR)
460 && strcmp(type, VDEV_TYPE_DISK)
461 #ifdef ZFS_TEST
462 && strcmp(type, VDEV_TYPE_FILE)
463 #endif
464 && strcmp(type, VDEV_TYPE_RAIDZ)
465 && strcmp(type, VDEV_TYPE_REPLACING)) {
466 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
467 return (EIO);
468 }
469
470 is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
471
472 nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
473 &is_offline);
474 nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
475 &is_removed);
476 nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
477 &is_faulted);
478 nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
479 &is_degraded);
480 nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
481 &isnt_present);
482
483 vdev = vdev_find(guid);
484 if (!vdev) {
485 is_new = 1;
486
487 if (!strcmp(type, VDEV_TYPE_MIRROR))
488 vdev = vdev_create(guid, vdev_mirror_read);
489 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
490 vdev = vdev_create(guid, vdev_raidz_read);
491 else if (!strcmp(type, VDEV_TYPE_REPLACING))
492 vdev = vdev_create(guid, vdev_replacing_read);
493 else
494 vdev = vdev_create(guid, vdev_disk_read);
495
496 vdev->v_id = id;
497 vdev->v_top = pvdev != NULL ? pvdev : vdev;
498 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
499 DATA_TYPE_UINT64, 0, &ashift) == 0)
500 vdev->v_ashift = ashift;
501 else
502 vdev->v_ashift = 0;
503 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
504 DATA_TYPE_UINT64, 0, &nparity) == 0)
505 vdev->v_nparity = nparity;
506 else
507 vdev->v_nparity = 0;
508 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
509 DATA_TYPE_STRING, 0, &path) == 0) {
510 if (strncmp(path, "/dev/", 5) == 0)
511 path += 5;
512 vdev->v_name = strdup(path);
513 } else {
514 if (!strcmp(type, "raidz")) {
515 if (vdev->v_nparity == 1)
516 vdev->v_name = "raidz1";
517 else if (vdev->v_nparity == 2)
518 vdev->v_name = "raidz2";
519 else if (vdev->v_nparity == 3)
520 vdev->v_name = "raidz3";
521 else {
522 printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
523 return (EIO);
524 }
525 } else {
526 vdev->v_name = strdup(type);
527 }
528 }
529 } else {
530 is_new = 0;
531 }
532
533 if (is_new || is_newer) {
534 /*
535 * This is either new vdev or we've already seen this vdev,
536 * but from an older vdev label, so let's refresh its state
537 * from the newer label.
538 */
539 if (is_offline)
540 vdev->v_state = VDEV_STATE_OFFLINE;
541 else if (is_removed)
542 vdev->v_state = VDEV_STATE_REMOVED;
543 else if (is_faulted)
544 vdev->v_state = VDEV_STATE_FAULTED;
545 else if (is_degraded)
546 vdev->v_state = VDEV_STATE_DEGRADED;
547 else if (isnt_present)
548 vdev->v_state = VDEV_STATE_CANT_OPEN;
549 }
550
551 rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
552 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
553 /*
554 * Its ok if we don't have any kids.
555 */
556 if (rc == 0) {
557 vdev->v_nchildren = nkids;
558 for (i = 0; i < nkids; i++) {
559 rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
560 if (rc)
561 return (rc);
562 if (is_new)
563 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
564 v_childlink);
565 kids = nvlist_next(kids);
566 }
567 } else {
568 vdev->v_nchildren = 0;
569 }
570
571 if (vdevp)
572 *vdevp = vdev;
573 return (0);
574 }
575
576 static void
577 vdev_set_state(vdev_t *vdev)
578 {
579 vdev_t *kid;
580 int good_kids;
581 int bad_kids;
582
583 /*
584 * A mirror or raidz is healthy if all its kids are healthy. A
585 * mirror is degraded if any of its kids is healthy; a raidz
586 * is degraded if at most nparity kids are offline.
587 */
588 if (STAILQ_FIRST(&vdev->v_children)) {
589 good_kids = 0;
590 bad_kids = 0;
591 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
592 if (kid->v_state == VDEV_STATE_HEALTHY)
593 good_kids++;
594 else
595 bad_kids++;
596 }
597 if (bad_kids == 0) {
598 vdev->v_state = VDEV_STATE_HEALTHY;
599 } else {
600 if (vdev->v_read == vdev_mirror_read) {
601 if (good_kids) {
602 vdev->v_state = VDEV_STATE_DEGRADED;
603 } else {
604 vdev->v_state = VDEV_STATE_OFFLINE;
605 }
606 } else if (vdev->v_read == vdev_raidz_read) {
607 if (bad_kids > vdev->v_nparity) {
608 vdev->v_state = VDEV_STATE_OFFLINE;
609 } else {
610 vdev->v_state = VDEV_STATE_DEGRADED;
611 }
612 }
613 }
614 }
615 }
616
617 static spa_t *
618 spa_find_by_guid(uint64_t guid)
619 {
620 spa_t *spa;
621
622 STAILQ_FOREACH(spa, &zfs_pools, spa_link)
623 if (spa->spa_guid == guid)
624 return (spa);
625
626 return (0);
627 }
628
629 #ifdef BOOT2
630
631 static spa_t *
632 spa_find_by_name(const char *name)
633 {
634 spa_t *spa;
635
636 STAILQ_FOREACH(spa, &zfs_pools, spa_link)
637 if (!strcmp(spa->spa_name, name))
638 return (spa);
639
640 return (0);
641 }
642
643 #endif
644
645 static spa_t *
646 spa_create(uint64_t guid)
647 {
648 spa_t *spa;
649
650 spa = malloc(sizeof(spa_t));
651 memset(spa, 0, sizeof(spa_t));
652 STAILQ_INIT(&spa->spa_vdevs);
653 spa->spa_guid = guid;
654 STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
655
656 return (spa);
657 }
658
659 static const char *
660 state_name(vdev_state_t state)
661 {
662 static const char* names[] = {
663 "UNKNOWN",
664 "CLOSED",
665 "OFFLINE",
666 "REMOVED",
667 "CANT_OPEN",
668 "FAULTED",
669 "DEGRADED",
670 "ONLINE"
671 };
672 return names[state];
673 }
674
675 #ifdef BOOT2
676
677 #define pager_printf printf
678
679 #else
680
/*
 * printf-style front end to pager_output(): format one line into a local
 * buffer and pass it to the pager.
 *
 * The formatting is bounded by the buffer size, so a long pool or device
 * name is truncated instead of overrunning the stack buffer.
 * NOTE(review): relies on vsnprintf() being provided by the boot
 * environment's C library alongside vsprintf() - confirm for this tree.
 */
static void
pager_printf(const char *fmt, ...)
{
	char line[80];
	va_list args;

	va_start(args, fmt);
	vsnprintf(line, sizeof(line), fmt, args);
	va_end(args);
	pager_output(line);
}
692
693 #endif
694
695 #define STATUS_FORMAT " %s %s\n"
696
697 static void
698 print_state(int indent, const char *name, vdev_state_t state)
699 {
700 int i;
701 char buf[512];
702
703 buf[0] = 0;
704 for (i = 0; i < indent; i++)
705 strcat(buf, " ");
706 strcat(buf, name);
707 pager_printf(STATUS_FORMAT, buf, state_name(state));
708
709 }
710
711 static void
712 vdev_status(vdev_t *vdev, int indent)
713 {
714 vdev_t *kid;
715 print_state(indent, vdev->v_name, vdev->v_state);
716
717 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
718 vdev_status(kid, indent + 1);
719 }
720 }
721
722 static void
723 spa_status(spa_t *spa)
724 {
725 vdev_t *vdev;
726 int good_kids, bad_kids, degraded_kids;
727 vdev_state_t state;
728
729 pager_printf(" pool: %s\n", spa->spa_name);
730 pager_printf("config:\n\n");
731 pager_printf(STATUS_FORMAT, "NAME", "STATE");
732
733 good_kids = 0;
734 degraded_kids = 0;
735 bad_kids = 0;
736 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
737 if (vdev->v_state == VDEV_STATE_HEALTHY)
738 good_kids++;
739 else if (vdev->v_state == VDEV_STATE_DEGRADED)
740 degraded_kids++;
741 else
742 bad_kids++;
743 }
744
745 state = VDEV_STATE_CLOSED;
746 if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
747 state = VDEV_STATE_HEALTHY;
748 else if ((good_kids + degraded_kids) > 0)
749 state = VDEV_STATE_DEGRADED;
750
751 print_state(0, spa->spa_name, state);
752 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
753 vdev_status(vdev, 1);
754 }
755 }
756
757 static void
758 spa_all_status(void)
759 {
760 spa_t *spa;
761 int first = 1;
762
763 STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
764 if (!first)
765 pager_printf("\n");
766 first = 0;
767 spa_status(spa);
768 }
769 }
770
771 static int
772 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
773 {
774 vdev_t vtmp;
775 vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
776 spa_t *spa;
777 vdev_t *vdev, *top_vdev, *pool_vdev;
778 off_t off;
779 blkptr_t bp;
780 const unsigned char *nvlist;
781 uint64_t val;
782 uint64_t guid;
783 uint64_t pool_txg, pool_guid;
784 uint64_t is_log;
785 const char *pool_name;
786 const unsigned char *vdevs;
787 int i, rc, is_newer;
788 char *upbuf;
789 const struct uberblock *up;
790
791 /*
792 * Load the vdev label and figure out which
793 * uberblock is most current.
794 */
795 memset(&vtmp, 0, sizeof(vtmp));
796 vtmp.v_phys_read = read;
797 vtmp.v_read_priv = read_priv;
798 off = offsetof(vdev_label_t, vl_vdev_phys);
799 BP_ZERO(&bp);
800 BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
801 BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
802 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
803 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
804 DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
805 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
806 if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
807 return (EIO);
808
809 if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
810 return (EIO);
811 }
812
813 nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
814
815 if (nvlist_find(nvlist,
816 ZPOOL_CONFIG_VERSION,
817 DATA_TYPE_UINT64, 0, &val)) {
818 return (EIO);
819 }
820
821 if (val > SPA_VERSION) {
822 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
823 (unsigned) val, (unsigned) SPA_VERSION);
824 return (EIO);
825 }
826
827 if (nvlist_find(nvlist,
828 ZPOOL_CONFIG_POOL_STATE,
829 DATA_TYPE_UINT64, 0, &val)) {
830 return (EIO);
831 }
832
833 if (val == POOL_STATE_DESTROYED) {
834 /* We don't boot only from destroyed pools. */
835 return (EIO);
836 }
837
838 if (nvlist_find(nvlist,
839 ZPOOL_CONFIG_POOL_TXG,
840 DATA_TYPE_UINT64, 0, &pool_txg)
841 || nvlist_find(nvlist,
842 ZPOOL_CONFIG_POOL_GUID,
843 DATA_TYPE_UINT64, 0, &pool_guid)
844 || nvlist_find(nvlist,
845 ZPOOL_CONFIG_POOL_NAME,
846 DATA_TYPE_STRING, 0, &pool_name)) {
847 /*
848 * Cache and spare devices end up here - just ignore
849 * them.
850 */
851 /*printf("ZFS: can't find pool details\n");*/
852 return (EIO);
853 }
854
855 is_log = 0;
856 (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
857 &is_log);
858 if (is_log)
859 return (EIO);
860
861 /*
862 * Create the pool if this is the first time we've seen it.
863 */
864 spa = spa_find_by_guid(pool_guid);
865 if (!spa) {
866 spa = spa_create(pool_guid);
867 spa->spa_name = strdup(pool_name);
868 }
869 if (pool_txg > spa->spa_txg) {
870 spa->spa_txg = pool_txg;
871 is_newer = 1;
872 } else
873 is_newer = 0;
874
875 /*
876 * Get the vdev tree and create our in-core copy of it.
877 * If we already have a vdev with this guid, this must
878 * be some kind of alias (overlapping slices, dangerously dedicated
879 * disks etc).
880 */
881 if (nvlist_find(nvlist,
882 ZPOOL_CONFIG_GUID,
883 DATA_TYPE_UINT64, 0, &guid)) {
884 return (EIO);
885 }
886 vdev = vdev_find(guid);
887 if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
888 return (EIO);
889
890 if (nvlist_find(nvlist,
891 ZPOOL_CONFIG_VDEV_TREE,
892 DATA_TYPE_NVLIST, 0, &vdevs)) {
893 return (EIO);
894 }
895
896 rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
897 if (rc)
898 return (rc);
899
900 /*
901 * Add the toplevel vdev to the pool if its not already there.
902 */
903 STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
904 if (top_vdev == pool_vdev)
905 break;
906 if (!pool_vdev && top_vdev)
907 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
908
909 /*
910 * We should already have created an incomplete vdev for this
911 * vdev. Find it and initialise it with our read proc.
912 */
913 vdev = vdev_find(guid);
914 if (vdev) {
915 vdev->v_phys_read = read;
916 vdev->v_read_priv = read_priv;
917 vdev->v_state = VDEV_STATE_HEALTHY;
918 } else {
919 printf("ZFS: inconsistent nvlist contents\n");
920 return (EIO);
921 }
922
923 /*
924 * Re-evaluate top-level vdev state.
925 */
926 vdev_set_state(top_vdev);
927
928 /*
929 * Ok, we are happy with the pool so far. Lets find
930 * the best uberblock and then we can actually access
931 * the contents of the pool.
932 */
933 upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
934 up = (const struct uberblock *)upbuf;
935 for (i = 0;
936 i < VDEV_UBERBLOCK_COUNT(vdev);
937 i++) {
938 off = VDEV_UBERBLOCK_OFFSET(vdev, i);
939 BP_ZERO(&bp);
940 DVA_SET_OFFSET(&bp.blk_dva[0], off);
941 BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
942 BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
943 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
944 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
945 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
946
947 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
948 continue;
949
950 if (up->ub_magic != UBERBLOCK_MAGIC)
951 continue;
952 if (up->ub_txg < spa->spa_txg)
953 continue;
954 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
955 spa->spa_uberblock = *up;
956 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
957 if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
958 spa->spa_uberblock = *up;
959 }
960 }
961 zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
962
963 if (spap)
964 *spap = spa;
965 return (0);
966 }
967
/*
 * Return the exponent v (0..31) for which n == 2^v, or -1 if the bit
 * pattern of n is not a single set bit.
 *
 * The comparison is done on unsigned values because `1 << 31' overflows a
 * signed int.
 */
static int
ilog2(int n)
{
	unsigned int v;

	for (v = 0; v < 32; v++)
		if ((unsigned int)n == (1U << v))
			return ((int)v);
	return (-1);
}
978
979 static int
980 zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
981 {
982 blkptr_t gbh_bp;
983 zio_gbh_phys_t zio_gb;
984 char *pbuf;
985 int i;
986
987 /* Artificial BP for gang block header. */
988 gbh_bp = *bp;
989 BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
990 BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
991 BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
992 BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
993 for (i = 0; i < SPA_DVAS_PER_BP; i++)
994 DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
995
996 /* Read gang header block using the artificial BP. */
997 if (zio_read(spa, &gbh_bp, &zio_gb))
998 return (EIO);
999
1000 pbuf = buf;
1001 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1002 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1003
1004 if (BP_IS_HOLE(gbp))
1005 continue;
1006 if (zio_read(spa, gbp, pbuf))
1007 return (EIO);
1008 pbuf += BP_GET_PSIZE(gbp);
1009 }
1010
1011 if (zio_checksum_verify(bp, buf))
1012 return (EIO);
1013 return (0);
1014 }
1015
1016 static int
1017 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
1018 {
1019 int cpfunc = BP_GET_COMPRESS(bp);
1020 uint64_t align, size;
1021 void *pbuf;
1022 int i, error;
1023
1024 error = EIO;
1025
1026 for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1027 const dva_t *dva = &bp->blk_dva[i];
1028 vdev_t *vdev;
1029 int vdevid;
1030 off_t offset;
1031
1032 if (!dva->dva_word[0] && !dva->dva_word[1])
1033 continue;
1034
1035 vdevid = DVA_GET_VDEV(dva);
1036 offset = DVA_GET_OFFSET(dva);
1037 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1038 if (vdev->v_id == vdevid)
1039 break;
1040 }
1041 if (!vdev || !vdev->v_read)
1042 continue;
1043
1044 size = BP_GET_PSIZE(bp);
1045 if (vdev->v_read == vdev_raidz_read) {
1046 align = 1ULL << vdev->v_top->v_ashift;
1047 if (P2PHASE(size, align) != 0)
1048 size = P2ROUNDUP(size, align);
1049 }
1050 if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1051 pbuf = zfs_alloc(size);
1052 else
1053 pbuf = buf;
1054
1055 if (DVA_GET_GANG(dva))
1056 error = zio_read_gang(spa, bp, pbuf);
1057 else
1058 error = vdev->v_read(vdev, bp, pbuf, offset, size);
1059 if (error == 0) {
1060 if (cpfunc != ZIO_COMPRESS_OFF)
1061 error = zio_decompress_data(cpfunc, pbuf,
1062 BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1063 else if (size != BP_GET_PSIZE(bp))
1064 bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1065 }
1066 if (buf != pbuf)
1067 zfs_free(pbuf, size);
1068 if (error == 0)
1069 break;
1070 }
1071 if (error != 0)
1072 printf("ZFS: i/o error - all block copies unavailable\n");
1073 return (error);
1074 }
1075
1076 static int
1077 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1078 {
1079 int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1080 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1081 int nlevels = dnode->dn_nlevels;
1082 int i, rc;
1083
1084 /*
1085 * Note: bsize may not be a power of two here so we need to do an
1086 * actual divide rather than a bitshift.
1087 */
1088 while (buflen > 0) {
1089 uint64_t bn = offset / bsize;
1090 int boff = offset % bsize;
1091 int ibn;
1092 const blkptr_t *indbp;
1093 blkptr_t bp;
1094
1095 if (bn > dnode->dn_maxblkid)
1096 return (EIO);
1097
1098 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1099 goto cached;
1100
1101 indbp = dnode->dn_blkptr;
1102 for (i = 0; i < nlevels; i++) {
1103 /*
1104 * Copy the bp from the indirect array so that
1105 * we can re-use the scratch buffer for multi-level
1106 * objects.
1107 */
1108 ibn = bn >> ((nlevels - i - 1) * ibshift);
1109 ibn &= ((1 << ibshift) - 1);
1110 bp = indbp[ibn];
1111 rc = zio_read(spa, &bp, dnode_cache_buf);
1112 if (rc)
1113 return (rc);
1114 indbp = (const blkptr_t *) dnode_cache_buf;
1115 }
1116 dnode_cache_obj = dnode;
1117 dnode_cache_bn = bn;
1118 cached:
1119
1120 /*
1121 * The buffer contains our data block. Copy what we
1122 * need from it and loop.
1123 */
1124 i = bsize - boff;
1125 if (i > buflen) i = buflen;
1126 memcpy(buf, &dnode_cache_buf[boff], i);
1127 buf = ((char*) buf) + i;
1128 offset += i;
1129 buflen -= i;
1130 }
1131
1132 return (0);
1133 }
1134
1135 /*
1136 * Lookup a value in a microzap directory. Assumes that the zap
1137 * scratch buffer contains the directory contents.
1138 */
1139 static int
1140 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1141 {
1142 const mzap_phys_t *mz;
1143 const mzap_ent_phys_t *mze;
1144 size_t size;
1145 int chunks, i;
1146
1147 /*
1148 * Microzap objects use exactly one block. Read the whole
1149 * thing.
1150 */
1151 size = dnode->dn_datablkszsec * 512;
1152
1153 mz = (const mzap_phys_t *) zap_scratch;
1154 chunks = size / MZAP_ENT_LEN - 1;
1155
1156 for (i = 0; i < chunks; i++) {
1157 mze = &mz->mz_chunk[i];
1158 if (!strcmp(mze->mze_name, name)) {
1159 *value = mze->mze_value;
1160 return (0);
1161 }
1162 }
1163
1164 return (ENOENT);
1165 }
1166
1167 /*
1168 * Compare a name with a zap leaf entry. Return non-zero if the name
1169 * matches.
1170 */
1171 static int
1172 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1173 {
1174 size_t namelen;
1175 const zap_leaf_chunk_t *nc;
1176 const char *p;
1177
1178 namelen = zc->l_entry.le_name_length;
1179
1180 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1181 p = name;
1182 while (namelen > 0) {
1183 size_t len;
1184 len = namelen;
1185 if (len > ZAP_LEAF_ARRAY_BYTES)
1186 len = ZAP_LEAF_ARRAY_BYTES;
1187 if (memcmp(p, nc->l_array.la_array, len))
1188 return (0);
1189 p += len;
1190 namelen -= len;
1191 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1192 }
1193
1194 return 1;
1195 }
1196
1197 /*
1198 * Extract a uint64_t value from a zap leaf entry.
1199 */
1200 static uint64_t
1201 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1202 {
1203 const zap_leaf_chunk_t *vc;
1204 int i;
1205 uint64_t value;
1206 const uint8_t *p;
1207
1208 vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1209 for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1210 value = (value << 8) | p[i];
1211 }
1212
1213 return value;
1214 }
1215
1216 /*
1217 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1218 * buffer contains the directory header.
1219 */
1220 static int
1221 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1222 {
1223 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1224 zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1225 fat_zap_t z;
1226 uint64_t *ptrtbl;
1227 uint64_t hash;
1228 int rc;
1229
1230 if (zh.zap_magic != ZAP_MAGIC)
1231 return (EIO);
1232
1233 z.zap_block_shift = ilog2(bsize);
1234 z.zap_phys = (zap_phys_t *) zap_scratch;
1235
1236 /*
1237 * Figure out where the pointer table is and read it in if necessary.
1238 */
1239 if (zh.zap_ptrtbl.zt_blk) {
1240 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1241 zap_scratch, bsize);
1242 if (rc)
1243 return (rc);
1244 ptrtbl = (uint64_t *) zap_scratch;
1245 } else {
1246 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1247 }
1248
1249 hash = zap_hash(zh.zap_salt, name);
1250
1251 zap_leaf_t zl;
1252 zl.l_bs = z.zap_block_shift;
1253
1254 off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1255 zap_leaf_chunk_t *zc;
1256
1257 rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1258 if (rc)
1259 return (rc);
1260
1261 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1262
1263 /*
1264 * Make sure this chunk matches our hash.
1265 */
1266 if (zl.l_phys->l_hdr.lh_prefix_len > 0
1267 && zl.l_phys->l_hdr.lh_prefix
1268 != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1269 return (ENOENT);
1270
1271 /*
1272 * Hash within the chunk to find our entry.
1273 */
1274 int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1275 int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1276 h = zl.l_phys->l_hash[h];
1277 if (h == 0xffff)
1278 return (ENOENT);
1279 zc = &ZAP_LEAF_CHUNK(&zl, h);
1280 while (zc->l_entry.le_hash != hash) {
1281 if (zc->l_entry.le_next == 0xffff) {
1282 zc = 0;
1283 break;
1284 }
1285 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1286 }
1287 if (fzap_name_equal(&zl, zc, name)) {
1288 *value = fzap_leaf_value(&zl, zc);
1289 return (0);
1290 }
1291
1292 return (ENOENT);
1293 }
1294
1295 /*
1296 * Lookup a name in a zap object and return its value as a uint64_t.
1297 */
1298 static int
1299 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1300 {
1301 int rc;
1302 uint64_t zap_type;
1303 size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1304
1305 rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1306 if (rc)
1307 return (rc);
1308
1309 zap_type = *(uint64_t *) zap_scratch;
1310 if (zap_type == ZBT_MICRO)
1311 return mzap_lookup(spa, dnode, name, value);
1312 else if (zap_type == ZBT_HEADER)
1313 return fzap_lookup(spa, dnode, name, value);
1314 printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1315 return (EIO);
1316 }
1317
1318 #ifdef BOOT2
1319
1320 /*
1321 * List a microzap directory. Assumes that the zap scratch buffer contains
1322 * the directory contents.
1323 */
1324 static int
1325 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
1326 {
1327 const mzap_phys_t *mz;
1328 const mzap_ent_phys_t *mze;
1329 size_t size;
1330 int chunks, i;
1331
1332 /*
1333 * Microzap objects use exactly one block. Read the whole
1334 * thing.
1335 */
1336 size = dnode->dn_datablkszsec * 512;
1337 mz = (const mzap_phys_t *) zap_scratch;
1338 chunks = size / MZAP_ENT_LEN - 1;
1339
1340 for (i = 0; i < chunks; i++) {
1341 mze = &mz->mz_chunk[i];
1342 if (mze->mze_name[0])
1343 //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
1344 printf("%s\n", mze->mze_name);
1345 }
1346
1347 return (0);
1348 }
1349
1350 /*
1351 * List a fatzap directory. Assumes that the zap scratch buffer contains
1352 * the directory header.
1353 */
1354 static int
1355 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
1356 {
1357 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1358 zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1359 fat_zap_t z;
1360 int i, j;
1361
1362 if (zh.zap_magic != ZAP_MAGIC)
1363 return (EIO);
1364
1365 z.zap_block_shift = ilog2(bsize);
1366 z.zap_phys = (zap_phys_t *) zap_scratch;
1367
1368 /*
1369 * This assumes that the leaf blocks start at block 1. The
1370 * documentation isn't exactly clear on this.
1371 */
1372 zap_leaf_t zl;
1373 zl.l_bs = z.zap_block_shift;
1374 for (i = 0; i < zh.zap_num_leafs; i++) {
1375 off_t off = (i + 1) << zl.l_bs;
1376 char name[256], *p;
1377 uint64_t value;
1378
1379 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1380 return (EIO);
1381
1382 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1383
1384 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1385 zap_leaf_chunk_t *zc, *nc;
1386 int namelen;
1387
1388 zc = &ZAP_LEAF_CHUNK(&zl, j);
1389 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1390 continue;
1391 namelen = zc->l_entry.le_name_length;
1392 if (namelen > sizeof(name))
1393 namelen = sizeof(name);
1394
1395 /*
1396 * Paste the name back together.
1397 */
1398 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1399 p = name;
1400 while (namelen > 0) {
1401 int len;
1402 len = namelen;
1403 if (len > ZAP_LEAF_ARRAY_BYTES)
1404 len = ZAP_LEAF_ARRAY_BYTES;
1405 memcpy(p, nc->l_array.la_array, len);
1406 p += len;
1407 namelen -= len;
1408 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1409 }
1410
1411 /*
1412 * Assume the first eight bytes of the value are
1413 * a uint64_t.
1414 */
1415 value = fzap_leaf_value(&zl, zc);
1416
1417 printf("%s 0x%llx\n", name, value);
1418 }
1419 }
1420
1421 return (0);
1422 }
1423
1424 /*
1425 * List a zap directory.
1426 */
1427 static int
1428 zap_list(spa_t *spa, const dnode_phys_t *dnode)
1429 {
1430 uint64_t zap_type;
1431 size_t size = dnode->dn_datablkszsec * 512;
1432
1433 if (dnode_read(spa, dnode, 0, zap_scratch, size))
1434 return (EIO);
1435
1436 zap_type = *(uint64_t *) zap_scratch;
1437 if (zap_type == ZBT_MICRO)
1438 return mzap_list(spa, dnode);
1439 else
1440 return fzap_list(spa, dnode);
1441 }
1442
1443 #endif
1444
1445 static int
1446 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1447 {
1448 off_t offset;
1449
1450 offset = objnum * sizeof(dnode_phys_t);
1451 return dnode_read(spa, &os->os_meta_dnode, offset,
1452 dnode, sizeof(dnode_phys_t));
1453 }
1454
1455 /*
1456 * Find the object set given the object number of its dataset object
1457 * and return its details in *objset
1458 */
1459 static int
1460 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1461 {
1462 dnode_phys_t dataset;
1463 dsl_dataset_phys_t *ds;
1464
1465 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1466 printf("ZFS: can't find dataset %llu\n", objnum);
1467 return (EIO);
1468 }
1469
1470 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1471 if (zio_read(spa, &ds->ds_bp, objset)) {
1472 printf("ZFS: can't read object set for dataset %llu\n", objnum);
1473 return (EIO);
1474 }
1475
1476 return (0);
1477 }
1478
1479 /*
1480 * Find the object set pointed to by the BOOTFS property or the root
1481 * dataset if there is none and return its details in *objset
1482 */
1483 static int
1484 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
1485 {
1486 dnode_phys_t dir, propdir;
1487 uint64_t props, bootfs, root;
1488
1489 /*
1490 * Start with the MOS directory object.
1491 */
1492 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1493 printf("ZFS: can't read MOS object directory\n");
1494 return (EIO);
1495 }
1496
1497 /*
1498 * Lookup the pool_props and see if we can find a bootfs.
1499 */
1500 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1501 && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1502 && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1503 && bootfs != 0)
1504 return zfs_mount_dataset(spa, bootfs, objset);
1505
1506 /*
1507 * Lookup the root dataset directory
1508 */
1509 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1510 || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1511 printf("ZFS: can't find root dsl_dir\n");
1512 return (EIO);
1513 }
1514
1515 /*
1516 * Use the information from the dataset directory's bonus buffer
1517 * to find the dataset object and from that the object set itself.
1518 */
1519 dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1520 return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
1521 }
1522
1523 static int
1524 zfs_mount_pool(spa_t *spa)
1525 {
1526
1527 /*
1528 * Find the MOS and work our way in from there.
1529 */
1530 if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1531 printf("ZFS: can't read MOS\n");
1532 return (EIO);
1533 }
1534
1535 /*
1536 * Find the root object set
1537 */
1538 if (zfs_mount_root(spa, &spa->spa_root_objset)) {
1539 printf("Can't find root filesystem - giving up\n");
1540 return (EIO);
1541 }
1542
1543 return (0);
1544 }
1545
1546 static int
1547 zfs_dnode_stat(spa_t *spa, dnode_phys_t *dn, struct stat *sb)
1548 {
1549
1550 if (dn->dn_bonustype != DMU_OT_SA) {
1551 znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
1552
1553 sb->st_mode = zp->zp_mode;
1554 sb->st_uid = zp->zp_uid;
1555 sb->st_gid = zp->zp_gid;
1556 sb->st_size = zp->zp_size;
1557 } else {
1558 sa_hdr_phys_t *sahdrp;
1559 int hdrsize;
1560 size_t size = 0;
1561 void *buf = NULL;
1562
1563 if (dn->dn_bonuslen != 0)
1564 sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
1565 else {
1566 if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
1567 blkptr_t *bp = &dn->dn_spill;
1568 int error;
1569
1570 size = BP_GET_LSIZE(bp);
1571 buf = zfs_alloc(size);
1572 error = zio_read(spa, bp, buf);
1573 if (error != 0) {
1574 zfs_free(buf, size);
1575 return (error);
1576 }
1577 sahdrp = buf;
1578 } else {
1579 return (EIO);
1580 }
1581 }
1582 hdrsize = SA_HDR_SIZE(sahdrp);
1583 sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
1584 SA_MODE_OFFSET);
1585 sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
1586 SA_UID_OFFSET);
1587 sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
1588 SA_GID_OFFSET);
1589 sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
1590 SA_SIZE_OFFSET);
1591 if (buf != NULL)
1592 zfs_free(buf, size);
1593 }
1594
1595 return (0);
1596 }
1597
1598 /*
1599 * Lookup a file and return its dnode.
1600 */
1601 static int
1602 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
1603 {
1604 int rc;
1605 uint64_t objnum, rootnum, parentnum;
1606 dnode_phys_t dn;
1607 const char *p, *q;
1608 char element[256];
1609 char path[1024];
1610 int symlinks_followed = 0;
1611 struct stat sb;
1612
1613 if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
1614 printf("ZFS: unexpected object set type %llu\n",
1615 spa->spa_root_objset.os_type);
1616 return (EIO);
1617 }
1618
1619 /*
1620 * Get the root directory dnode.
1621 */
1622 rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
1623 if (rc)
1624 return (rc);
1625
1626 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
1627 if (rc)
1628 return (rc);
1629
1630 rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
1631 if (rc)
1632 return (rc);
1633
1634 objnum = rootnum;
1635 p = upath;
1636 while (p && *p) {
1637 while (*p == '/')
1638 p++;
1639 if (!*p)
1640 break;
1641 q = strchr(p, '/');
1642 if (q) {
1643 memcpy(element, p, q - p);
1644 element[q - p] = 0;
1645 p = q;
1646 } else {
1647 strcpy(element, p);
1648 p = 0;
1649 }
1650
1651 rc = zfs_dnode_stat(spa, &dn, &sb);
1652 if (rc)
1653 return (rc);
1654 if (!S_ISDIR(sb.st_mode))
1655 return (ENOTDIR);
1656
1657 parentnum = objnum;
1658 rc = zap_lookup(spa, &dn, element, &objnum);
1659 if (rc)
1660 return (rc);
1661 objnum = ZFS_DIRENT_OBJ(objnum);
1662
1663 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1664 if (rc)
1665 return (rc);
1666
1667 /*
1668 * Check for symlink.
1669 */
1670 rc = zfs_dnode_stat(spa, &dn, &sb);
1671 if (rc)
1672 return (rc);
1673 if (S_ISLNK(sb.st_mode)) {
1674 if (symlinks_followed > 10)
1675 return (EMLINK);
1676 symlinks_followed++;
1677
1678 /*
1679 * Read the link value and copy the tail of our
1680 * current path onto the end.
1681 */
1682 if (p)
1683 strcpy(&path[sb.st_size], p);
1684 else
1685 path[sb.st_size] = 0;
1686 if (sb.st_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
1687 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
1688 sb.st_size);
1689 } else {
1690 rc = dnode_read(spa, &dn, 0, path, sb.st_size);
1691 if (rc)
1692 return (rc);
1693 }
1694
1695 /*
1696 * Restart with the new path, starting either at
1697 * the root or at the parent depending whether or
1698 * not the link is relative.
1699 */
1700 p = path;
1701 if (*p == '/')
1702 objnum = rootnum;
1703 else
1704 objnum = parentnum;
1705 objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1706 }
1707 }
1708
1709 *dnode = dn;
1710 return (0);
1711 }
Cache object: 18f77a9046c83b24fd5ca2e2c936091f
|