FreeBSD/Linux Kernel Cross Reference
sys/boot/zfs/zfsimpl.c
1 /*-
2 * Copyright (c) 2007 Doug Rabson
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/8.2/sys/boot/zfs/zfsimpl.c 214014 2010-10-18 09:26:39Z pjd $");
29
30 /*
31 * Stand-alone ZFS file reader.
32 */
33
34 #include "zfsimpl.h"
35 #include "zfssubr.c"
36
37 /*
38 * List of all vdevs, chained through v_alllink.
39 */
40 static vdev_list_t zfs_vdevs;
41
42 /*
43 * List of all pools, chained through spa_link.
44 */
45 static spa_list_t zfs_pools;
46
47 static uint64_t zfs_crc64_table[256];
48 static const dnode_phys_t *dnode_cache_obj = 0;
49 static uint64_t dnode_cache_bn;
50 static char *dnode_cache_buf;
51 static char *zap_scratch;
52 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
53
54 #define TEMP_SIZE (1024 * 1024)
55
56 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
57
58 static void
59 zfs_init(void)
60 {
61 STAILQ_INIT(&zfs_vdevs);
62 STAILQ_INIT(&zfs_pools);
63
64 zfs_temp_buf = malloc(TEMP_SIZE);
65 zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
66 zfs_temp_ptr = zfs_temp_buf;
67 dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
68 zap_scratch = malloc(SPA_MAXBLOCKSIZE);
69
70 zfs_init_crc();
71 }
72
73 static char *
74 zfs_alloc_temp(size_t sz)
75 {
76 char *p;
77
78 if (zfs_temp_ptr + sz > zfs_temp_end) {
79 printf("ZFS: out of temporary buffer space\n");
80 for (;;) ;
81 }
82 p = zfs_temp_ptr;
83 zfs_temp_ptr += sz;
84
85 return (p);
86 }
87
88 static void
89 zfs_reset_temp(void)
90 {
91
92 zfs_temp_ptr = zfs_temp_buf;
93 }
94
/*
 * Decode one big-endian 32-bit integer at *xdr into *ip and advance *xdr
 * past it.  Always returns 0.
 */
static int
xdr_int(const unsigned char **xdr, int *ip)
{
	const unsigned char *p = *xdr;
	uint32_t v;

	/*
	 * Assemble the value in an unsigned type: shifting a promoted
	 * (signed) int left by 24 is undefined when the top bit is set.
	 */
	v = ((uint32_t)p[0] << 24)
	    | ((uint32_t)p[1] << 16)
	    | ((uint32_t)p[2] << 8)
	    | (uint32_t)p[3];
	*ip = (int)v;
	*xdr = p + 4;
	return (0);
}
105
106 static int
107 xdr_u_int(const unsigned char **xdr, u_int *ip)
108 {
109 *ip = ((*xdr)[0] << 24)
110 | ((*xdr)[1] << 16)
111 | ((*xdr)[2] << 8)
112 | ((*xdr)[3] << 0);
113 (*xdr) += 4;
114 return (0);
115 }
116
/*
 * Decode one big-endian 64-bit unsigned integer (high word first) at *xdr
 * into *lp and advance *xdr by eight bytes.  Always returns 0.
 */
static int
xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
{
	const unsigned char *src = *xdr;
	uint64_t acc = 0;
	int n;

	/* Most significant byte comes first. */
	for (n = 0; n < 8; n++)
		acc = (acc << 8) | src[n];

	*lp = acc;
	*xdr = src + 8;
	return (0);
}
127
128 static int
129 nvlist_find(const unsigned char *nvlist, const char *name, int type,
130 int* elementsp, void *valuep)
131 {
132 const unsigned char *p, *pair;
133 int junk;
134 int encoded_size, decoded_size;
135
136 p = nvlist;
137 xdr_int(&p, &junk);
138 xdr_int(&p, &junk);
139
140 pair = p;
141 xdr_int(&p, &encoded_size);
142 xdr_int(&p, &decoded_size);
143 while (encoded_size && decoded_size) {
144 int namelen, pairtype, elements;
145 const char *pairname;
146
147 xdr_int(&p, &namelen);
148 pairname = (const char*) p;
149 p += roundup(namelen, 4);
150 xdr_int(&p, &pairtype);
151
152 if (!memcmp(name, pairname, namelen) && type == pairtype) {
153 xdr_int(&p, &elements);
154 if (elementsp)
155 *elementsp = elements;
156 if (type == DATA_TYPE_UINT64) {
157 xdr_uint64_t(&p, (uint64_t *) valuep);
158 return (0);
159 } else if (type == DATA_TYPE_STRING) {
160 int len;
161 xdr_int(&p, &len);
162 (*(const char**) valuep) = (const char*) p;
163 return (0);
164 } else if (type == DATA_TYPE_NVLIST
165 || type == DATA_TYPE_NVLIST_ARRAY) {
166 (*(const unsigned char**) valuep) =
167 (const unsigned char*) p;
168 return (0);
169 } else {
170 return (EIO);
171 }
172 } else {
173 /*
174 * Not the pair we are looking for, skip to the next one.
175 */
176 p = pair + encoded_size;
177 }
178
179 pair = p;
180 xdr_int(&p, &encoded_size);
181 xdr_int(&p, &decoded_size);
182 }
183
184 return (EIO);
185 }
186
/*
 * Step over one encoded nvlist and return the address just past the
 * header of its terminating (zero-sized) pair, which is where the next
 * nvlist of an nvlist array begins.
 */
static const unsigned char *
nvlist_next(const unsigned char *nvlist)
{
	const unsigned char *cursor, *pair_start;
	int unused;
	int enc_len, dec_len;

	/* Skip the two leading words of the list header. */
	cursor = nvlist;
	xdr_int(&cursor, &unused);
	xdr_int(&cursor, &unused);

	for (;;) {
		pair_start = cursor;
		xdr_int(&cursor, &enc_len);
		xdr_int(&cursor, &dec_len);
		if (enc_len == 0 || dec_len == 0)
			break;
		/* Jump to the next pair using the encoded size. */
		cursor = pair_start + enc_len;
	}

	return (cursor);
}
214
215 #ifdef TEST
216
217 static const unsigned char *
218 nvlist_print(const unsigned char *nvlist, unsigned int indent)
219 {
220 static const char* typenames[] = {
221 "DATA_TYPE_UNKNOWN",
222 "DATA_TYPE_BOOLEAN",
223 "DATA_TYPE_BYTE",
224 "DATA_TYPE_INT16",
225 "DATA_TYPE_UINT16",
226 "DATA_TYPE_INT32",
227 "DATA_TYPE_UINT32",
228 "DATA_TYPE_INT64",
229 "DATA_TYPE_UINT64",
230 "DATA_TYPE_STRING",
231 "DATA_TYPE_BYTE_ARRAY",
232 "DATA_TYPE_INT16_ARRAY",
233 "DATA_TYPE_UINT16_ARRAY",
234 "DATA_TYPE_INT32_ARRAY",
235 "DATA_TYPE_UINT32_ARRAY",
236 "DATA_TYPE_INT64_ARRAY",
237 "DATA_TYPE_UINT64_ARRAY",
238 "DATA_TYPE_STRING_ARRAY",
239 "DATA_TYPE_HRTIME",
240 "DATA_TYPE_NVLIST",
241 "DATA_TYPE_NVLIST_ARRAY",
242 "DATA_TYPE_BOOLEAN_VALUE",
243 "DATA_TYPE_INT8",
244 "DATA_TYPE_UINT8",
245 "DATA_TYPE_BOOLEAN_ARRAY",
246 "DATA_TYPE_INT8_ARRAY",
247 "DATA_TYPE_UINT8_ARRAY"
248 };
249
250 unsigned int i, j;
251 const unsigned char *p, *pair;
252 int junk;
253 int encoded_size, decoded_size;
254
255 p = nvlist;
256 xdr_int(&p, &junk);
257 xdr_int(&p, &junk);
258
259 pair = p;
260 xdr_int(&p, &encoded_size);
261 xdr_int(&p, &decoded_size);
262 while (encoded_size && decoded_size) {
263 int namelen, pairtype, elements;
264 const char *pairname;
265
266 xdr_int(&p, &namelen);
267 pairname = (const char*) p;
268 p += roundup(namelen, 4);
269 xdr_int(&p, &pairtype);
270
271 for (i = 0; i < indent; i++)
272 printf(" ");
273 printf("%s %s", typenames[pairtype], pairname);
274
275 xdr_int(&p, &elements);
276 switch (pairtype) {
277 case DATA_TYPE_UINT64: {
278 uint64_t val;
279 xdr_uint64_t(&p, &val);
280 printf(" = 0x%llx\n", val);
281 break;
282 }
283
284 case DATA_TYPE_STRING: {
285 int len;
286 xdr_int(&p, &len);
287 printf(" = \"%s\"\n", p);
288 break;
289 }
290
291 case DATA_TYPE_NVLIST:
292 printf("\n");
293 nvlist_print(p, indent + 1);
294 break;
295
296 case DATA_TYPE_NVLIST_ARRAY:
297 for (j = 0; j < elements; j++) {
298 printf("[%d]\n", j);
299 p = nvlist_print(p, indent + 1);
300 if (j != elements - 1) {
301 for (i = 0; i < indent; i++)
302 printf(" ");
303 printf("%s %s", typenames[pairtype], pairname);
304 }
305 }
306 break;
307
308 default:
309 printf("\n");
310 }
311
312 p = pair + encoded_size;
313
314 pair = p;
315 xdr_int(&p, &encoded_size);
316 xdr_int(&p, &decoded_size);
317 }
318
319 return p;
320 }
321
322 #endif
323
324 static int
325 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
326 off_t offset, size_t size)
327 {
328 size_t psize;
329 int rc;
330
331 if (!vdev->v_phys_read)
332 return (EIO);
333
334 if (bp) {
335 psize = BP_GET_PSIZE(bp);
336 } else {
337 psize = size;
338 }
339
340 /*printf("ZFS: reading %d bytes at 0x%llx to %p\n", psize, offset, buf);*/
341 rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
342 if (rc)
343 return (rc);
344 if (bp && zio_checksum_error(bp, buf))
345 return (EIO);
346
347 return (0);
348 }
349
350 static int
351 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
352 off_t offset, size_t bytes)
353 {
354
355 return (vdev_read_phys(vdev, bp, buf,
356 offset + VDEV_LABEL_START_SIZE, bytes));
357 }
358
359
360 static int
361 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
362 off_t offset, size_t bytes)
363 {
364 vdev_t *kid;
365 int rc;
366
367 rc = EIO;
368 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
369 if (kid->v_state != VDEV_STATE_HEALTHY)
370 continue;
371 rc = kid->v_read(kid, bp, buf, offset, bytes);
372 if (!rc)
373 return (0);
374 }
375
376 return (rc);
377 }
378
379 static int
380 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
381 off_t offset, size_t bytes)
382 {
383 vdev_t *kid;
384
385 /*
386 * Here we should have two kids:
387 * First one which is the one we are replacing and we can trust
388 * only this one to have valid data, but it might not be present.
389 * Second one is that one we are replacing with. It is most likely
390 * healthy, but we can't trust it has needed data, so we won't use it.
391 */
392 kid = STAILQ_FIRST(&vdev->v_children);
393 if (kid == NULL)
394 return (EIO);
395 if (kid->v_state != VDEV_STATE_HEALTHY)
396 return (EIO);
397 return (kid->v_read(kid, bp, buf, offset, bytes));
398 }
399
400 static vdev_t *
401 vdev_find(uint64_t guid)
402 {
403 vdev_t *vdev;
404
405 STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
406 if (vdev->v_guid == guid)
407 return (vdev);
408
409 return (0);
410 }
411
412 static vdev_t *
413 vdev_create(uint64_t guid, vdev_read_t *read)
414 {
415 vdev_t *vdev;
416
417 vdev = malloc(sizeof(vdev_t));
418 memset(vdev, 0, sizeof(vdev_t));
419 STAILQ_INIT(&vdev->v_children);
420 vdev->v_guid = guid;
421 vdev->v_state = VDEV_STATE_OFFLINE;
422 vdev->v_read = read;
423 vdev->v_phys_read = 0;
424 vdev->v_read_priv = 0;
425 STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
426
427 return (vdev);
428 }
429
430 static int
431 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
432 {
433 int rc;
434 uint64_t guid, id, ashift, nparity;
435 const char *type;
436 const char *path;
437 vdev_t *vdev, *kid;
438 const unsigned char *kids;
439 int nkids, i, is_new;
440 uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
441
442 if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
443 DATA_TYPE_UINT64, 0, &guid)
444 || nvlist_find(nvlist, ZPOOL_CONFIG_ID,
445 DATA_TYPE_UINT64, 0, &id)
446 || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE,
447 DATA_TYPE_STRING, 0, &type)) {
448 printf("ZFS: can't find vdev details\n");
449 return (ENOENT);
450 }
451
452 if (strcmp(type, VDEV_TYPE_MIRROR)
453 && strcmp(type, VDEV_TYPE_DISK)
454 && strcmp(type, VDEV_TYPE_RAIDZ)
455 && strcmp(type, VDEV_TYPE_REPLACING)) {
456 printf("ZFS: can only boot from disk, mirror or raidz vdevs\n");
457 return (EIO);
458 }
459
460 is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
461
462 nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
463 &is_offline);
464 nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
465 &is_removed);
466 nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
467 &is_faulted);
468 nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
469 &is_degraded);
470 nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, 0,
471 &isnt_present);
472
473 vdev = vdev_find(guid);
474 if (!vdev) {
475 is_new = 1;
476
477 if (!strcmp(type, VDEV_TYPE_MIRROR))
478 vdev = vdev_create(guid, vdev_mirror_read);
479 else if (!strcmp(type, VDEV_TYPE_RAIDZ))
480 vdev = vdev_create(guid, vdev_raidz_read);
481 else if (!strcmp(type, VDEV_TYPE_REPLACING))
482 vdev = vdev_create(guid, vdev_replacing_read);
483 else
484 vdev = vdev_create(guid, vdev_disk_read);
485
486 vdev->v_id = id;
487 if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
488 DATA_TYPE_UINT64, 0, &ashift) == 0)
489 vdev->v_ashift = ashift;
490 else
491 vdev->v_ashift = 0;
492 if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
493 DATA_TYPE_UINT64, 0, &nparity) == 0)
494 vdev->v_nparity = nparity;
495 else
496 vdev->v_nparity = 0;
497 if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
498 DATA_TYPE_STRING, 0, &path) == 0) {
499 if (strncmp(path, "/dev/", 5) == 0)
500 path += 5;
501 vdev->v_name = strdup(path);
502 } else {
503 if (!strcmp(type, "raidz")) {
504 if (vdev->v_nparity == 1)
505 vdev->v_name = "raidz1";
506 else
507 vdev->v_name = "raidz2";
508 } else {
509 vdev->v_name = strdup(type);
510 }
511 }
512 } else {
513 is_new = 0;
514 }
515
516 if (is_new || is_newer) {
517 /*
518 * This is either new vdev or we've already seen this vdev,
519 * but from an older vdev label, so let's refresh its state
520 * from the newer label.
521 */
522 if (is_offline)
523 vdev->v_state = VDEV_STATE_OFFLINE;
524 else if (is_removed)
525 vdev->v_state = VDEV_STATE_REMOVED;
526 else if (is_faulted)
527 vdev->v_state = VDEV_STATE_FAULTED;
528 else if (is_degraded)
529 vdev->v_state = VDEV_STATE_DEGRADED;
530 else if (isnt_present)
531 vdev->v_state = VDEV_STATE_CANT_OPEN;
532 else
533 vdev->v_state = VDEV_STATE_HEALTHY;
534 }
535
536 rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
537 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
538 /*
539 * Its ok if we don't have any kids.
540 */
541 if (rc == 0) {
542 vdev->v_nchildren = nkids;
543 for (i = 0; i < nkids; i++) {
544 rc = vdev_init_from_nvlist(kids, &kid, is_newer);
545 if (rc)
546 return (rc);
547 if (is_new)
548 STAILQ_INSERT_TAIL(&vdev->v_children, kid,
549 v_childlink);
550 kids = nvlist_next(kids);
551 }
552 } else {
553 vdev->v_nchildren = 0;
554 }
555
556 if (vdevp)
557 *vdevp = vdev;
558 return (0);
559 }
560
561 static void
562 vdev_set_state(vdev_t *vdev)
563 {
564 vdev_t *kid;
565 int good_kids;
566 int bad_kids;
567
568 /*
569 * A mirror or raidz is healthy if all its kids are healthy. A
570 * mirror is degraded if any of its kids is healthy; a raidz
571 * is degraded if at most nparity kids are offline.
572 */
573 if (STAILQ_FIRST(&vdev->v_children)) {
574 good_kids = 0;
575 bad_kids = 0;
576 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
577 if (kid->v_state == VDEV_STATE_HEALTHY)
578 good_kids++;
579 else
580 bad_kids++;
581 }
582 if (bad_kids == 0) {
583 vdev->v_state = VDEV_STATE_HEALTHY;
584 } else {
585 if (vdev->v_read == vdev_mirror_read) {
586 if (good_kids) {
587 vdev->v_state = VDEV_STATE_DEGRADED;
588 } else {
589 vdev->v_state = VDEV_STATE_OFFLINE;
590 }
591 } else if (vdev->v_read == vdev_raidz_read) {
592 if (bad_kids > vdev->v_nparity) {
593 vdev->v_state = VDEV_STATE_OFFLINE;
594 } else {
595 vdev->v_state = VDEV_STATE_DEGRADED;
596 }
597 }
598 }
599 }
600 }
601
602 static spa_t *
603 spa_find_by_guid(uint64_t guid)
604 {
605 spa_t *spa;
606
607 STAILQ_FOREACH(spa, &zfs_pools, spa_link)
608 if (spa->spa_guid == guid)
609 return (spa);
610
611 return (0);
612 }
613
614 #ifdef BOOT2
615
616 static spa_t *
617 spa_find_by_name(const char *name)
618 {
619 spa_t *spa;
620
621 STAILQ_FOREACH(spa, &zfs_pools, spa_link)
622 if (!strcmp(spa->spa_name, name))
623 return (spa);
624
625 return (0);
626 }
627
628 #endif
629
630 static spa_t *
631 spa_create(uint64_t guid)
632 {
633 spa_t *spa;
634
635 spa = malloc(sizeof(spa_t));
636 memset(spa, 0, sizeof(spa_t));
637 STAILQ_INIT(&spa->spa_vdevs);
638 spa->spa_guid = guid;
639 STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
640
641 return (spa);
642 }
643
644 static const char *
645 state_name(vdev_state_t state)
646 {
647 static const char* names[] = {
648 "UNKNOWN",
649 "CLOSED",
650 "OFFLINE",
651 "REMOVED",
652 "CANT_OPEN",
653 "FAULTED",
654 "DEGRADED",
655 "ONLINE"
656 };
657 return names[state];
658 }
659
660 #ifdef BOOT2
661
662 #define pager_printf printf
663
664 #else
665
/*
 * printf-style front end to pager_output(): format the message into a
 * local buffer and hand the result to the pager.
 *
 * NOTE(review): vsprintf() does not bound its output, so a formatted
 * message of 80 bytes or more overruns the buffer.  Switch to a bounded
 * formatter if the stand-alone library offers one - confirm availability.
 */
static void
pager_printf(const char *fmt, ...)
{
	va_list ap;
	char text[80];

	va_start(ap, fmt);
	vsprintf(text, fmt, ap);
	va_end(ap);

	pager_output(text);
}
677
678 #endif
679
680 #define STATUS_FORMAT " %s %s\n"
681
682 static void
683 print_state(int indent, const char *name, vdev_state_t state)
684 {
685 int i;
686 char buf[512];
687
688 buf[0] = 0;
689 for (i = 0; i < indent; i++)
690 strcat(buf, " ");
691 strcat(buf, name);
692 pager_printf(STATUS_FORMAT, buf, state_name(state));
693
694 }
695
696 static void
697 vdev_status(vdev_t *vdev, int indent)
698 {
699 vdev_t *kid;
700 print_state(indent, vdev->v_name, vdev->v_state);
701
702 STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
703 vdev_status(kid, indent + 1);
704 }
705 }
706
707 static void
708 spa_status(spa_t *spa)
709 {
710 vdev_t *vdev;
711 int good_kids, bad_kids, degraded_kids;
712 vdev_state_t state;
713
714 pager_printf(" pool: %s\n", spa->spa_name);
715 pager_printf("config:\n\n");
716 pager_printf(STATUS_FORMAT, "NAME", "STATE");
717
718 good_kids = 0;
719 degraded_kids = 0;
720 bad_kids = 0;
721 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
722 if (vdev->v_state == VDEV_STATE_HEALTHY)
723 good_kids++;
724 else if (vdev->v_state == VDEV_STATE_DEGRADED)
725 degraded_kids++;
726 else
727 bad_kids++;
728 }
729
730 state = VDEV_STATE_CLOSED;
731 if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
732 state = VDEV_STATE_HEALTHY;
733 else if ((good_kids + degraded_kids) > 0)
734 state = VDEV_STATE_DEGRADED;
735
736 print_state(0, spa->spa_name, state);
737 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
738 vdev_status(vdev, 1);
739 }
740 }
741
742 static void
743 spa_all_status(void)
744 {
745 spa_t *spa;
746 int first = 1;
747
748 STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
749 if (!first)
750 pager_printf("\n");
751 first = 0;
752 spa_status(spa);
753 }
754 }
755
756 static int
757 vdev_probe(vdev_phys_read_t *read, void *read_priv, spa_t **spap)
758 {
759 vdev_t vtmp;
760 vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
761 spa_t *spa;
762 vdev_t *vdev, *top_vdev, *pool_vdev;
763 off_t off;
764 blkptr_t bp;
765 const unsigned char *nvlist;
766 uint64_t val;
767 uint64_t guid;
768 uint64_t pool_txg, pool_guid;
769 uint64_t is_log;
770 const char *pool_name;
771 const unsigned char *vdevs;
772 int i, rc, is_newer;
773 char upbuf[1024];
774 const struct uberblock *up;
775
776 /*
777 * Load the vdev label and figure out which
778 * uberblock is most current.
779 */
780 memset(&vtmp, 0, sizeof(vtmp));
781 vtmp.v_phys_read = read;
782 vtmp.v_read_priv = read_priv;
783 off = offsetof(vdev_label_t, vl_vdev_phys);
784 BP_ZERO(&bp);
785 BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
786 BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
787 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
788 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
789 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
790 if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
791 return (EIO);
792
793 if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR) {
794 return (EIO);
795 }
796
797 nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
798
799 if (nvlist_find(nvlist,
800 ZPOOL_CONFIG_VERSION,
801 DATA_TYPE_UINT64, 0, &val)) {
802 return (EIO);
803 }
804
805 if (val > SPA_VERSION) {
806 printf("ZFS: unsupported ZFS version %u (should be %u)\n",
807 (unsigned) val, (unsigned) SPA_VERSION);
808 return (EIO);
809 }
810
811 if (nvlist_find(nvlist,
812 ZPOOL_CONFIG_POOL_STATE,
813 DATA_TYPE_UINT64, 0, &val)) {
814 return (EIO);
815 }
816
817 #ifndef TEST
818 if (val != POOL_STATE_ACTIVE) {
819 /*
820 * Don't print a message here. If we happen to reboot
821 * while where is an exported pool around, we don't
822 * need a cascade of confusing messages during boot.
823 */
824 /*printf("ZFS: pool is not active\n");*/
825 return (EIO);
826 }
827 #endif
828
829 if (nvlist_find(nvlist,
830 ZPOOL_CONFIG_POOL_TXG,
831 DATA_TYPE_UINT64, 0, &pool_txg)
832 || nvlist_find(nvlist,
833 ZPOOL_CONFIG_POOL_GUID,
834 DATA_TYPE_UINT64, 0, &pool_guid)
835 || nvlist_find(nvlist,
836 ZPOOL_CONFIG_POOL_NAME,
837 DATA_TYPE_STRING, 0, &pool_name)) {
838 /*
839 * Cache and spare devices end up here - just ignore
840 * them.
841 */
842 /*printf("ZFS: can't find pool details\n");*/
843 return (EIO);
844 }
845
846 is_log = 0;
847 (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, 0,
848 &is_log);
849 if (is_log)
850 return (EIO);
851
852 /*
853 * Create the pool if this is the first time we've seen it.
854 */
855 spa = spa_find_by_guid(pool_guid);
856 if (!spa) {
857 spa = spa_create(pool_guid);
858 spa->spa_name = strdup(pool_name);
859 }
860 if (pool_txg > spa->spa_txg) {
861 spa->spa_txg = pool_txg;
862 is_newer = 1;
863 } else
864 is_newer = 0;
865
866 /*
867 * Get the vdev tree and create our in-core copy of it.
868 * If we already have a vdev with this guid, this must
869 * be some kind of alias (overlapping slices, dangerously dedicated
870 * disks etc).
871 */
872 if (nvlist_find(nvlist,
873 ZPOOL_CONFIG_GUID,
874 DATA_TYPE_UINT64, 0, &guid)) {
875 return (EIO);
876 }
877 vdev = vdev_find(guid);
878 if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
879 return (EIO);
880
881 if (nvlist_find(nvlist,
882 ZPOOL_CONFIG_VDEV_TREE,
883 DATA_TYPE_NVLIST, 0, &vdevs)) {
884 return (EIO);
885 }
886
887 rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
888 if (rc)
889 return (rc);
890
891 /*
892 * Add the toplevel vdev to the pool if its not already there.
893 */
894 STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
895 if (top_vdev == pool_vdev)
896 break;
897 if (!pool_vdev && top_vdev)
898 STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
899
900 /*
901 * We should already have created an incomplete vdev for this
902 * vdev. Find it and initialise it with our read proc.
903 */
904 vdev = vdev_find(guid);
905 if (vdev) {
906 vdev->v_phys_read = read;
907 vdev->v_read_priv = read_priv;
908 } else {
909 printf("ZFS: inconsistent nvlist contents\n");
910 return (EIO);
911 }
912
913 /*
914 * Re-evaluate top-level vdev state.
915 */
916 vdev_set_state(top_vdev);
917
918 /*
919 * Ok, we are happy with the pool so far. Lets find
920 * the best uberblock and then we can actually access
921 * the contents of the pool.
922 */
923 for (i = 0;
924 i < VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT;
925 i++) {
926 off = offsetof(vdev_label_t, vl_uberblock);
927 off += i << UBERBLOCK_SHIFT;
928 BP_ZERO(&bp);
929 DVA_SET_OFFSET(&bp.blk_dva[0], off);
930 BP_SET_LSIZE(&bp, 1 << UBERBLOCK_SHIFT);
931 BP_SET_PSIZE(&bp, 1 << UBERBLOCK_SHIFT);
932 BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
933 BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
934 ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
935 if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
936 continue;
937
938 up = (const struct uberblock *) upbuf;
939 if (up->ub_magic != UBERBLOCK_MAGIC)
940 continue;
941 if (up->ub_txg < spa->spa_txg)
942 continue;
943 if (up->ub_txg > spa->spa_uberblock.ub_txg) {
944 spa->spa_uberblock = *up;
945 } else if (up->ub_txg == spa->spa_uberblock.ub_txg) {
946 if (up->ub_timestamp > spa->spa_uberblock.ub_timestamp)
947 spa->spa_uberblock = *up;
948 }
949 }
950
951 if (spap)
952 *spap = spa;
953 return (0);
954 }
955
/*
 * Return v such that n == 2^v, or -1 if n is not a power of two
 * (including 0).  The comparison is done on unsigned values so that the
 * 2^31 case does not shift into the sign bit of an int.
 */
static int
ilog2(int n)
{
	unsigned int bits = (unsigned int) n;
	int v;

	for (v = 0; v < 32; v++)
		if (bits == (1U << v))
			return v;
	return -1;
}
966
967 static int
968 zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
969 {
970 zio_gbh_phys_t zio_gb;
971 vdev_t *vdev;
972 int vdevid;
973 off_t offset;
974 int i;
975
976 vdevid = DVA_GET_VDEV(dva);
977 offset = DVA_GET_OFFSET(dva);
978 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
979 if (vdev->v_id == vdevid)
980 break;
981 if (!vdev || !vdev->v_read)
982 return (EIO);
983 if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
984 return (EIO);
985
986 for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
987 blkptr_t *gbp = &zio_gb.zg_blkptr[i];
988
989 if (BP_IS_HOLE(gbp))
990 continue;
991 if (zio_read(spa, gbp, buf))
992 return (EIO);
993 buf = (char*)buf + BP_GET_PSIZE(gbp);
994 }
995
996 return (0);
997 }
998
999 static int
1000 zio_read(spa_t *spa, const blkptr_t *bp, void *buf)
1001 {
1002 int cpfunc = BP_GET_COMPRESS(bp);
1003 size_t lsize = BP_GET_LSIZE(bp);
1004 size_t psize = BP_GET_PSIZE(bp);
1005 void *pbuf;
1006 int i;
1007
1008 zfs_reset_temp();
1009 if (cpfunc != ZIO_COMPRESS_OFF)
1010 pbuf = zfs_alloc_temp(psize);
1011 else
1012 pbuf = buf;
1013
1014 for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1015 const dva_t *dva = &bp->blk_dva[i];
1016 vdev_t *vdev;
1017 int vdevid;
1018 off_t offset;
1019
1020 if (!dva->dva_word[0] && !dva->dva_word[1])
1021 continue;
1022
1023 if (DVA_GET_GANG(dva)) {
1024 if (zio_read_gang(spa, bp, dva, buf))
1025 continue;
1026 } else {
1027 vdevid = DVA_GET_VDEV(dva);
1028 offset = DVA_GET_OFFSET(dva);
1029 STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
1030 if (vdev->v_id == vdevid)
1031 break;
1032 if (!vdev || !vdev->v_read) {
1033 continue;
1034 }
1035 if (vdev->v_read(vdev, bp, pbuf, offset, psize))
1036 continue;
1037
1038 if (cpfunc != ZIO_COMPRESS_OFF) {
1039 if (zio_decompress_data(cpfunc, pbuf, psize,
1040 buf, lsize))
1041 return (EIO);
1042 }
1043 }
1044
1045 return (0);
1046 }
1047 printf("ZFS: i/o error - all block copies unavailable\n");
1048
1049 return (EIO);
1050 }
1051
1052 static int
1053 dnode_read(spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1054 {
1055 int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1056 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1057 int nlevels = dnode->dn_nlevels;
1058 int i, rc;
1059
1060 /*
1061 * Note: bsize may not be a power of two here so we need to do an
1062 * actual divide rather than a bitshift.
1063 */
1064 while (buflen > 0) {
1065 uint64_t bn = offset / bsize;
1066 int boff = offset % bsize;
1067 int ibn;
1068 const blkptr_t *indbp;
1069 blkptr_t bp;
1070
1071 if (bn > dnode->dn_maxblkid)
1072 return (EIO);
1073
1074 if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1075 goto cached;
1076
1077 indbp = dnode->dn_blkptr;
1078 for (i = 0; i < nlevels; i++) {
1079 /*
1080 * Copy the bp from the indirect array so that
1081 * we can re-use the scratch buffer for multi-level
1082 * objects.
1083 */
1084 ibn = bn >> ((nlevels - i - 1) * ibshift);
1085 ibn &= ((1 << ibshift) - 1);
1086 bp = indbp[ibn];
1087 rc = zio_read(spa, &bp, dnode_cache_buf);
1088 if (rc)
1089 return (rc);
1090 indbp = (const blkptr_t *) dnode_cache_buf;
1091 }
1092 dnode_cache_obj = dnode;
1093 dnode_cache_bn = bn;
1094 cached:
1095
1096 /*
1097 * The buffer contains our data block. Copy what we
1098 * need from it and loop.
1099 */
1100 i = bsize - boff;
1101 if (i > buflen) i = buflen;
1102 memcpy(buf, &dnode_cache_buf[boff], i);
1103 buf = ((char*) buf) + i;
1104 offset += i;
1105 buflen -= i;
1106 }
1107
1108 return (0);
1109 }
1110
1111 /*
1112 * Lookup a value in a microzap directory. Assumes that the zap
1113 * scratch buffer contains the directory contents.
1114 */
1115 static int
1116 mzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1117 {
1118 const mzap_phys_t *mz;
1119 const mzap_ent_phys_t *mze;
1120 size_t size;
1121 int chunks, i;
1122
1123 /*
1124 * Microzap objects use exactly one block. Read the whole
1125 * thing.
1126 */
1127 size = dnode->dn_datablkszsec * 512;
1128
1129 mz = (const mzap_phys_t *) zap_scratch;
1130 chunks = size / MZAP_ENT_LEN - 1;
1131
1132 for (i = 0; i < chunks; i++) {
1133 mze = &mz->mz_chunk[i];
1134 if (!strcmp(mze->mze_name, name)) {
1135 *value = mze->mze_value;
1136 return (0);
1137 }
1138 }
1139
1140 return (ENOENT);
1141 }
1142
1143 /*
1144 * Compare a name with a zap leaf entry. Return non-zero if the name
1145 * matches.
1146 */
1147 static int
1148 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1149 {
1150 size_t namelen;
1151 const zap_leaf_chunk_t *nc;
1152 const char *p;
1153
1154 namelen = zc->l_entry.le_name_length;
1155
1156 nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1157 p = name;
1158 while (namelen > 0) {
1159 size_t len;
1160 len = namelen;
1161 if (len > ZAP_LEAF_ARRAY_BYTES)
1162 len = ZAP_LEAF_ARRAY_BYTES;
1163 if (memcmp(p, nc->l_array.la_array, len))
1164 return (0);
1165 p += len;
1166 namelen -= len;
1167 nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1168 }
1169
1170 return 1;
1171 }
1172
1173 /*
1174 * Extract a uint64_t value from a zap leaf entry.
1175 */
1176 static uint64_t
1177 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1178 {
1179 const zap_leaf_chunk_t *vc;
1180 int i;
1181 uint64_t value;
1182 const uint8_t *p;
1183
1184 vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1185 for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1186 value = (value << 8) | p[i];
1187 }
1188
1189 return value;
1190 }
1191
1192 /*
1193 * Lookup a value in a fatzap directory. Assumes that the zap scratch
1194 * buffer contains the directory header.
1195 */
1196 static int
1197 fzap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1198 {
1199 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1200 zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1201 fat_zap_t z;
1202 uint64_t *ptrtbl;
1203 uint64_t hash;
1204 int rc;
1205
1206 if (zh.zap_magic != ZAP_MAGIC)
1207 return (EIO);
1208
1209 z.zap_block_shift = ilog2(bsize);
1210 z.zap_phys = (zap_phys_t *) zap_scratch;
1211
1212 /*
1213 * Figure out where the pointer table is and read it in if necessary.
1214 */
1215 if (zh.zap_ptrtbl.zt_blk) {
1216 rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1217 zap_scratch, bsize);
1218 if (rc)
1219 return (rc);
1220 ptrtbl = (uint64_t *) zap_scratch;
1221 } else {
1222 ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1223 }
1224
1225 hash = zap_hash(zh.zap_salt, name);
1226
1227 zap_leaf_t zl;
1228 zl.l_bs = z.zap_block_shift;
1229
1230 off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1231 zap_leaf_chunk_t *zc;
1232
1233 rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1234 if (rc)
1235 return (rc);
1236
1237 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1238
1239 /*
1240 * Make sure this chunk matches our hash.
1241 */
1242 if (zl.l_phys->l_hdr.lh_prefix_len > 0
1243 && zl.l_phys->l_hdr.lh_prefix
1244 != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1245 return (ENOENT);
1246
1247 /*
1248 * Hash within the chunk to find our entry.
1249 */
1250 int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1251 int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1252 h = zl.l_phys->l_hash[h];
1253 if (h == 0xffff)
1254 return (ENOENT);
1255 zc = &ZAP_LEAF_CHUNK(&zl, h);
1256 while (zc->l_entry.le_hash != hash) {
1257 if (zc->l_entry.le_next == 0xffff) {
1258 zc = 0;
1259 break;
1260 }
1261 zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1262 }
1263 if (fzap_name_equal(&zl, zc, name)) {
1264 *value = fzap_leaf_value(&zl, zc);
1265 return (0);
1266 }
1267
1268 return (ENOENT);
1269 }
1270
1271 /*
1272 * Lookup a name in a zap object and return its value as a uint64_t.
1273 */
1274 static int
1275 zap_lookup(spa_t *spa, const dnode_phys_t *dnode, const char *name, uint64_t *value)
1276 {
1277 int rc;
1278 uint64_t zap_type;
1279 size_t size = dnode->dn_datablkszsec * 512;
1280
1281 rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1282 if (rc)
1283 return (rc);
1284
1285 zap_type = *(uint64_t *) zap_scratch;
1286 if (zap_type == ZBT_MICRO)
1287 return mzap_lookup(spa, dnode, name, value);
1288 else
1289 return fzap_lookup(spa, dnode, name, value);
1290 }
1291
1292 #ifdef BOOT2
1293
1294 /*
1295 * List a microzap directory. Assumes that the zap scratch buffer contains
1296 * the directory contents.
1297 */
1298 static int
1299 mzap_list(spa_t *spa, const dnode_phys_t *dnode)
1300 {
1301 const mzap_phys_t *mz;
1302 const mzap_ent_phys_t *mze;
1303 size_t size;
1304 int chunks, i;
1305
1306 /*
1307 * Microzap objects use exactly one block. Read the whole
1308 * thing.
1309 */
1310 size = dnode->dn_datablkszsec * 512;
1311 mz = (const mzap_phys_t *) zap_scratch;
1312 chunks = size / MZAP_ENT_LEN - 1;
1313
1314 for (i = 0; i < chunks; i++) {
1315 mze = &mz->mz_chunk[i];
1316 if (mze->mze_name[0])
1317 //printf("%-32s 0x%llx\n", mze->mze_name, mze->mze_value);
1318 printf("%s\n", mze->mze_name);
1319 }
1320
1321 return (0);
1322 }
1323
1324 /*
1325 * List a fatzap directory. Assumes that the zap scratch buffer contains
1326 * the directory header.
1327 */
1328 static int
1329 fzap_list(spa_t *spa, const dnode_phys_t *dnode)
1330 {
1331 int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1332 zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1333 fat_zap_t z;
1334 int i, j;
1335
1336 if (zh.zap_magic != ZAP_MAGIC)
1337 return (EIO);
1338
1339 z.zap_block_shift = ilog2(bsize);
1340 z.zap_phys = (zap_phys_t *) zap_scratch;
1341
1342 /*
1343 * This assumes that the leaf blocks start at block 1. The
1344 * documentation isn't exactly clear on this.
1345 */
1346 zap_leaf_t zl;
1347 zl.l_bs = z.zap_block_shift;
1348 for (i = 0; i < zh.zap_num_leafs; i++) {
1349 off_t off = (i + 1) << zl.l_bs;
1350 char name[256], *p;
1351 uint64_t value;
1352
1353 if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1354 return (EIO);
1355
1356 zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1357
1358 for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1359 zap_leaf_chunk_t *zc, *nc;
1360 int namelen;
1361
1362 zc = &ZAP_LEAF_CHUNK(&zl, j);
1363 if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1364 continue;
1365 namelen = zc->l_entry.le_name_length;
1366 if (namelen > sizeof(name))
1367 namelen = sizeof(name);
1368
1369 /*
1370 * Paste the name back together.
1371 */
1372 nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1373 p = name;
1374 while (namelen > 0) {
1375 int len;
1376 len = namelen;
1377 if (len > ZAP_LEAF_ARRAY_BYTES)
1378 len = ZAP_LEAF_ARRAY_BYTES;
1379 memcpy(p, nc->l_array.la_array, len);
1380 p += len;
1381 namelen -= len;
1382 nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1383 }
1384
1385 /*
1386 * Assume the first eight bytes of the value are
1387 * a uint64_t.
1388 */
1389 value = fzap_leaf_value(&zl, zc);
1390
1391 printf("%s 0x%llx\n", name, value);
1392 }
1393 }
1394
1395 return (0);
1396 }
1397
1398 /*
1399 * List a zap directory.
1400 */
1401 static int
1402 zap_list(spa_t *spa, const dnode_phys_t *dnode)
1403 {
1404 uint64_t zap_type;
1405 size_t size = dnode->dn_datablkszsec * 512;
1406
1407 if (dnode_read(spa, dnode, 0, zap_scratch, size))
1408 return (EIO);
1409
1410 zap_type = *(uint64_t *) zap_scratch;
1411 if (zap_type == ZBT_MICRO)
1412 return mzap_list(spa, dnode);
1413 else
1414 return fzap_list(spa, dnode);
1415 }
1416
1417 #endif
1418
1419 static int
1420 objset_get_dnode(spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1421 {
1422 off_t offset;
1423
1424 offset = objnum * sizeof(dnode_phys_t);
1425 return dnode_read(spa, &os->os_meta_dnode, offset,
1426 dnode, sizeof(dnode_phys_t));
1427 }
1428
1429 /*
1430 * Find the object set given the object number of its dataset object
1431 * and return its details in *objset
1432 */
1433 static int
1434 zfs_mount_dataset(spa_t *spa, uint64_t objnum, objset_phys_t *objset)
1435 {
1436 dnode_phys_t dataset;
1437 dsl_dataset_phys_t *ds;
1438
1439 if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1440 printf("ZFS: can't find dataset %llu\n", objnum);
1441 return (EIO);
1442 }
1443
1444 ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
1445 if (zio_read(spa, &ds->ds_bp, objset)) {
1446 printf("ZFS: can't read object set for dataset %llu\n", objnum);
1447 return (EIO);
1448 }
1449
1450 return (0);
1451 }
1452
1453 /*
1454 * Find the object set pointed to by the BOOTFS property or the root
1455 * dataset if there is none and return its details in *objset
1456 */
1457 static int
1458 zfs_mount_root(spa_t *spa, objset_phys_t *objset)
1459 {
1460 dnode_phys_t dir, propdir;
1461 uint64_t props, bootfs, root;
1462
1463 /*
1464 * Start with the MOS directory object.
1465 */
1466 if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
1467 printf("ZFS: can't read MOS object directory\n");
1468 return (EIO);
1469 }
1470
1471 /*
1472 * Lookup the pool_props and see if we can find a bootfs.
1473 */
1474 if (zap_lookup(spa, &dir, DMU_POOL_PROPS, &props) == 0
1475 && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
1476 && zap_lookup(spa, &propdir, "bootfs", &bootfs) == 0
1477 && bootfs != 0)
1478 return zfs_mount_dataset(spa, bootfs, objset);
1479
1480 /*
1481 * Lookup the root dataset directory
1482 */
1483 if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, &root)
1484 || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
1485 printf("ZFS: can't find root dsl_dir\n");
1486 return (EIO);
1487 }
1488
1489 /*
1490 * Use the information from the dataset directory's bonus buffer
1491 * to find the dataset object and from that the object set itself.
1492 */
1493 dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
1494 return zfs_mount_dataset(spa, dd->dd_head_dataset_obj, objset);
1495 }
1496
1497 static int
1498 zfs_mount_pool(spa_t *spa)
1499 {
1500 /*
1501 * Find the MOS and work our way in from there.
1502 */
1503 if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
1504 printf("ZFS: can't read MOS\n");
1505 return (EIO);
1506 }
1507
1508 /*
1509 * Find the root object set
1510 */
1511 if (zfs_mount_root(spa, &spa->spa_root_objset)) {
1512 printf("Can't find root filesystem - giving up\n");
1513 return (EIO);
1514 }
1515
1516 return (0);
1517 }
1518
1519 /*
1520 * Lookup a file and return its dnode.
1521 */
1522 static int
1523 zfs_lookup(spa_t *spa, const char *upath, dnode_phys_t *dnode)
1524 {
1525 int rc;
1526 uint64_t objnum, rootnum, parentnum;
1527 dnode_phys_t dn;
1528 const znode_phys_t *zp = (const znode_phys_t *) dn.dn_bonus;
1529 const char *p, *q;
1530 char element[256];
1531 char path[1024];
1532 int symlinks_followed = 0;
1533
1534 if (spa->spa_root_objset.os_type != DMU_OST_ZFS) {
1535 printf("ZFS: unexpected object set type %llu\n",
1536 spa->spa_root_objset.os_type);
1537 return (EIO);
1538 }
1539
1540 /*
1541 * Get the root directory dnode.
1542 */
1543 rc = objset_get_dnode(spa, &spa->spa_root_objset, MASTER_NODE_OBJ, &dn);
1544 if (rc)
1545 return (rc);
1546
1547 rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, &rootnum);
1548 if (rc)
1549 return (rc);
1550
1551 rc = objset_get_dnode(spa, &spa->spa_root_objset, rootnum, &dn);
1552 if (rc)
1553 return (rc);
1554
1555 objnum = rootnum;
1556 p = upath;
1557 while (p && *p) {
1558 while (*p == '/')
1559 p++;
1560 if (!*p)
1561 break;
1562 q = strchr(p, '/');
1563 if (q) {
1564 memcpy(element, p, q - p);
1565 element[q - p] = 0;
1566 p = q;
1567 } else {
1568 strcpy(element, p);
1569 p = 0;
1570 }
1571
1572 if ((zp->zp_mode >> 12) != 0x4) {
1573 return (ENOTDIR);
1574 }
1575
1576 parentnum = objnum;
1577 rc = zap_lookup(spa, &dn, element, &objnum);
1578 if (rc)
1579 return (rc);
1580 objnum = ZFS_DIRENT_OBJ(objnum);
1581
1582 rc = objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1583 if (rc)
1584 return (rc);
1585
1586 /*
1587 * Check for symlink.
1588 */
1589 if ((zp->zp_mode >> 12) == 0xa) {
1590 if (symlinks_followed > 10)
1591 return (EMLINK);
1592 symlinks_followed++;
1593
1594 /*
1595 * Read the link value and copy the tail of our
1596 * current path onto the end.
1597 */
1598 if (p)
1599 strcpy(&path[zp->zp_size], p);
1600 else
1601 path[zp->zp_size] = 0;
1602 if (zp->zp_size + sizeof(znode_phys_t) <= dn.dn_bonuslen) {
1603 memcpy(path, &dn.dn_bonus[sizeof(znode_phys_t)],
1604 zp->zp_size);
1605 } else {
1606 rc = dnode_read(spa, &dn, 0, path, zp->zp_size);
1607 if (rc)
1608 return (rc);
1609 }
1610
1611 /*
1612 * Restart with the new path, starting either at
1613 * the root or at the parent depending whether or
1614 * not the link is relative.
1615 */
1616 p = path;
1617 if (*p == '/')
1618 objnum = rootnum;
1619 else
1620 objnum = parentnum;
1621 objset_get_dnode(spa, &spa->spa_root_objset, objnum, &dn);
1622 }
1623 }
1624
1625 *dnode = dn;
1626 return (0);
1627 }
Cache object: b2c0de5d6abcf6566069c0f0c246ecab
|