/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot. No user command needs to be
 * run before opening and using a device.
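 *
 * For example, `zfs create -V 10G tank/vol` creates a volume that is
 * then accessible as /dev/zvol/tank/vol.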
 *
 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 * in the system. Except when they're simply character devices (volmode=dev).
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>
#include <sys/freebsd_event.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE	"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif
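
/*
 * With ZVOL_LOCK_DEBUG defined, the "reader" acquisitions of
 * zv_suspend_lock below are promoted to writer acquisitions and the
 * matching held-assertions check for write ownership, which makes
 * locking mistakes fail loudly during development.
 */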

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
			struct selinfo zsd_selinfo;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};
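
/*
 * Exactly one arm of _zso_state is valid for a given volume, selected
 * by zv_volmode; the zso_dev and zso_geom macros above name the
 * corresponding union member.
 */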

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t zvol_cdev_open;
static d_close_t zvol_cdev_close;
static d_ioctl_t zvol_cdev_ioctl;
static d_read_t zvol_cdev_read;
static d_write_t zvol_cdev_write;
static d_strategy_t zvol_geom_bio_strategy;
static d_kqfilter_t zvol_cdev_kqfilter;

static struct cdevsw zvol_cdevsw = {
	.d_name = "zvol",
	.d_version = D_VERSION,
	.d_flags = D_DISK | D_TRACKCLOSE,
	.d_open = zvol_cdev_open,
	.d_close = zvol_cdev_close,
	.d_ioctl = zvol_cdev_ioctl,
	.d_read = zvol_cdev_read,
	.d_write = zvol_cdev_write,
	.d_strategy = zvol_geom_bio_strategy,
	.d_kqfilter = zvol_cdev_kqfilter,
};
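
/*
 * With D_TRACKCLOSE set, close(2) calls are not collapsed to the last
 * one; each reaches zvol_cdev_close(), so zv_open_count stays balanced
 * against the number of successful opens.
 */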

static void zvol_filter_detach(struct knote *kn);
static int zvol_filter_vnode(struct knote *kn, long hint);

static struct filterops zvol_filterops_vnode = {
	.f_isfd = 1,
	.f_detach = zvol_filter_detach,
	.f_event = zvol_filter_vnode,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */

/*
 * GEOM mode implementation
 */

static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV. In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev. Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of private under zvol_state_lock to make sure either
	 * the result of zvol free code setting private to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flag & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	(void) flag;
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY3S(zsg->zsg_state, ==, ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}
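
/*
 * Example for zvol_geom_access() above: g_access(cp, 1, 1, 0) arrives
 * with acr = 1, acw = 1, ace = 0, so count = 2 and flags =
 * FREAD | FWRITE, which is forwarded to zvol_geom_open(); the matching
 * g_access(cp, -1, -1, 0) closes through zvol_geom_close().
 */

/*
 * The worker thread below drains zsg_queue until
 * zvol_os_clear_private() sets zsg_state to ZVOL_GEOM_STOPPED.  The
 * worker acknowledges by setting the state back to ZVOL_GEOM_RUNNING
 * and waking the waiter before exiting; zvol_os_clear_private() sleeps
 * on zsg_state until it sees that acknowledgment.
 */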

static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

static void
zvol_filter_detach(struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = kn->kn_hook;
	zsd = &zv->zv_zso->zso_dev;

	knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0);
}

static int
zvol_filter_vnode(struct knote *kn, long hint)
{
	kn->kn_fflags |= kn->kn_sfflags & hint;

	return (kn->kn_fflags != 0);
}

static int
zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;

	zv = dev->si_drv2;
	zsd = &zv->zv_zso->zso_dev;

	if (kn->kn_filter != EVFILT_VNODE)
		return (EINVAL);

	/* XXX: extend support for other NOTE_* events */
	if (kn->kn_sfflags != NOTE_ATTRIB)
		return (EINVAL);

	kn->kn_fop = &zvol_filterops_vnode;
	kn->kn_hook = zv;
	knlist_add(&zsd->zsd_selinfo.si_note, kn, 0);

	return (0);
}
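
/*
 * Userspace can watch a volmode=dev zvol for attribute changes (e.g.
 * the resize notification posted by zvol_os_update_volsize()).  A
 * minimal sketch of a hypothetical consumer; the device path is only
 * an example:
 *
 *	int fd = open("/dev/zvol/tank/vol", O_RDONLY);
 *	int kq = kqueue();
 *	struct kevent ev;
 *	EV_SET(&ev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_ATTRIB,
 *	    0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	(register the filter)
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	(block until NOTE_ATTRIB)
 */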

static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
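		/*
		 * BIO_FLUSH carries no payload to copy; jump straight
		 * to the zil_commit() at the "sync" label below.
		 */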
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* Don't read past the end. */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* Convert checksum errors into IO errors. */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);
	int64_t nread = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	ssize_t start_resid = zfs_uio_resid(&uio);
	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* Don't write past the end. */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	int64_t nwritten = start_resid - zfs_uio_resid(&uio);
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	/*
	 * Obtain a copy of si_drv2 under zvol_state_lock to make sure either
	 * the result of zvol free code setting si_drv2 to NULL is observed,
	 * or the zv is protected from being freed because of the positive
	 * zv_open_count.
	 */
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * Take spa_namespace_lock to prevent lock inversion when
		 * zvols from one pool are opened as vdevs in another.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				kern_yield(PRI_USER);
				goto retry;
			} else {
				drop_namespace = B_TRUE;
			}
		}
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
		if (err)
			goto out_zv_locked;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
	if (flags & O_EXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	zv->zv_open_count++;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & O_SYNC) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0 || length > zv->zv_volsize - offset) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}

/*
 * Misc. helpers
 */
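
/*
 * zvol_ensure_zilog() is called with zv_suspend_lock held as reader.
 * It may temporarily upgrade (or re-acquire) the lock as writer to
 * open the ZIL, but it always returns with the lock held as reader
 * again via rw_downgrade().
 */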
static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_os_create_minor() */
			VERIFY0(zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED);
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}
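
/*
 * For example, "/dev/zvol/tank/vol" carries the ZVOL_DIR prefix and is
 * reported as a zvol below; "/dev/ada0" is not.
 */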
boolean_t
zvol_os_is_zvol(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* Move to a new hashtable entry. */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
void
zvol_os_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL) {
			ASSERT3P(dev->si_drv2, ==, NULL);
			destroy_dev(dev);
			knlist_clear(&zsd->zsd_selinfo.si_note, 0);
			knlist_destroy(&zsd->zsd_selinfo.si_note);
		}
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;
	boolean_t replayed_zil = B_FALSE;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* Lie and say we're read-only. */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name)
		    == 0) {
#if __FreeBSD_version > 1300130
			dev->si_iosize_max = maxphys;
#else
			dev->si_iosize_max = MAXPHYS;
#endif
			zsd->zsd_cdev = dev;
			knlist_init_sx(&zsd->zsd_selinfo.si_note,
			    &zv->zv_state_lock);
		}
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		if (dev != NULL)
			dev->si_drv2 = NULL;
	}
}
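
/*
 * Propagate a new volume size to the device layer: GEOM consumers see
 * a provider resize, while volmode=dev consumers are notified via the
 * EVFILT_VNODE/NOTE_ATTRIB filter registered in zvol_cdev_kqfilter().
 */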
int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;

		KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB);
	}
	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{
	// XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{
	// XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
}

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}