4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
25 * All rights reserved.
27 * Portions Copyright 2010 Robert Milkowski
29 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
30 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
31 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
32 * Copyright (c) 2014 Integros [integros.com]
35 /* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
38 * ZFS volume emulation driver.
40 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
41 * Volumes are accessed through the symbolic links named:
43 * /dev/zvol/<pool_name>/<dataset_name>
45 * Volumes are persistent through reboot. No user command needs to be
46 * run before opening and using a device.
48 * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
49 * in the system. Except when they're simply character devices (volmode=dev).
52 #include <sys/types.h>
53 #include <sys/param.h>
54 #include <sys/kernel.h>
55 #include <sys/errno.h>
61 #include <sys/cmn_err.h>
66 #include <sys/spa_impl.h>
69 #include <sys/dmu_traverse.h>
70 #include <sys/dnode.h>
71 #include <sys/dsl_dataset.h>
72 #include <sys/dsl_prop.h>
73 #include <sys/dsl_dir.h>
74 #include <sys/byteorder.h>
75 #include <sys/sunddi.h>
76 #include <sys/dirent.h>
77 #include <sys/policy.h>
78 #include <sys/queue.h>
79 #include <sys/fs/zfs.h>
80 #include <sys/zfs_ioctl.h>
82 #include <sys/zfs_znode.h>
83 #include <sys/zfs_rlock.h>
84 #include <sys/vdev_impl.h>
85 #include <sys/vdev_raidz.h>
87 #include <sys/zil_impl.h>
88 #include <sys/dataset_kstats.h>
90 #include <sys/dmu_tx.h>
91 #include <sys/zfeature.h>
92 #include <sys/zio_checksum.h>
93 #include <sys/zil_impl.h>
94 #include <sys/filio.h>
96 #include <geom/geom.h>
98 #include <sys/zvol_impl.h>
100 #include "zfs_namecheck.h"
102 #define ZVOL_DUMPSIZE "dumpsize"
104 #ifdef ZVOL_LOCK_DEBUG
105 #define ZVOL_RW_READER RW_WRITER
106 #define ZVOL_RW_READ_HELD RW_WRITE_HELD
108 #define ZVOL_RW_READER RW_READER
109 #define ZVOL_RW_READ_HELD RW_READ_HELD
/*
 * Per-zvol OS state, tunables, and driver glue.
 * NOTE(review): this file is a fragmentary extract — the embedded
 * original line numbers are non-contiguous, so struct bodies, closing
 * braces and #else/#endif lines are missing from view.
 */
112 enum zvol_geom_state {
118 struct zvol_state_os {
119 #define zso_dev _zso_state._zso_dev
120 #define zso_geom _zso_state._zso_geom
/* Per-zvol state used in character-device (volmode=dev) mode. */
123 struct zvol_state_dev {
124 struct cdev *zsd_cdev;
125 uint64_t zsd_sync_cnt;
/* Per-zvol state used in GEOM (volmode=geom) mode. */
129 struct zvol_state_geom {
130 struct g_provider *zsg_provider;
131 struct bio_queue_head zsg_queue;
132 struct mtx zsg_queue_mtx;
133 enum zvol_geom_state zsg_state;
/* Count of currently instantiated zvol minors (module-busy indicator). */
139 static uint32_t zvol_minors;
141 SYSCTL_DECL(_vfs_zfs);
142 SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
143 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
144 "Expose as GEOM providers (1), device files (2) or neither");
145 static boolean_t zpool_on_zvol = B_FALSE;
146 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
147 "Allow zpools to use zvols as vdevs (DANGEROUS)");
150 * Toggle unmap functionality.
152 boolean_t zvol_unmap_enabled = B_TRUE;
154 SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
155 &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
158 * zvol maximum transfer in one DMU tx.
160 int zvol_maxphys = DMU_MAX_ACCESS / 2;
162 static void zvol_ensure_zilog(zvol_state_t *zv);
/* Character-device entry points (volmode=dev). */
164 static d_open_t zvol_cdev_open;
165 static d_close_t zvol_cdev_close;
166 static d_ioctl_t zvol_cdev_ioctl;
167 static d_read_t zvol_cdev_read;
168 static d_write_t zvol_cdev_write;
/* Shared strategy routine; also used by the cdev path below. */
169 static d_strategy_t zvol_geom_bio_strategy;
171 static struct cdevsw zvol_cdevsw = {
173 .d_version = D_VERSION,
174 .d_flags = D_DISK | D_TRACKCLOSE,
175 .d_open = zvol_cdev_open,
176 .d_close = zvol_cdev_close,
177 .d_ioctl = zvol_cdev_ioctl,
178 .d_read = zvol_cdev_read,
179 .d_write = zvol_cdev_write,
180 .d_strategy = zvol_geom_bio_strategy,
183 extern uint_t zfs_geom_probe_vdev_key;
185 struct g_class zfs_zvol_class = {
187 .version = G_VERSION,
190 DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
/* GEOM-mode entry points. */
192 static int zvol_geom_open(struct g_provider *pp, int flag, int count);
193 static int zvol_geom_close(struct g_provider *pp, int flag, int count);
194 static void zvol_geom_run(zvol_state_t *zv);
195 static void zvol_geom_destroy(zvol_state_t *zv);
196 static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
197 static void zvol_geom_worker(void *arg);
198 static void zvol_geom_bio_start(struct bio *bp);
199 static int zvol_geom_bio_getattr(struct bio *bp);
200 /* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
203 * GEOM mode implementation
/*
 * GEOM open path for a zvol provider: looks up the zvol, takes the
 * namespace/suspend/state locks in the documented order, performs the
 * first open, and enforces exclusive-open (ZVOL_EXCL) semantics.
 * NOTE(review): fragmentary extract — error-exit labels and closing
 * braces are missing from view; comments describe intent only.
 */
208 zvol_geom_open(struct g_provider *pp, int flag, int count)
212 boolean_t drop_suspend = B_FALSE;
213 boolean_t drop_namespace = B_FALSE;
215 if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
217 * if zfs_geom_probe_vdev_key is set, that means that zfs is
218 * attempting to probe geom providers while looking for a
219 * replacement for a missing VDEV. In this case, the
220 * spa_namespace_lock will not be held, but it is still illegal
221 * to use a zvol as a vdev. Deadlocks can result if another
222 * thread has spa_namespace_lock
224 return (SET_ERROR(EOPNOTSUPP));
228 rw_enter(&zvol_state_lock, ZVOL_RW_READER);
231 rw_exit(&zvol_state_lock);
232 err = SET_ERROR(ENXIO);
236 if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
238 * We need to guarantee that the namespace lock is held
239 * to avoid spurious failures in zvol_first_open.
241 drop_namespace = B_TRUE;
242 if (!mutex_tryenter(&spa_namespace_lock)) {
243 rw_exit(&zvol_state_lock);
244 mutex_enter(&spa_namespace_lock);
248 mutex_enter(&zv->zv_state_lock);
/* Refuse opens on a zvol already marked for destruction. */
249 if (zv->zv_zso->zso_dying) {
250 rw_exit(&zvol_state_lock);
251 err = SET_ERROR(ENXIO);
254 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
257 * make sure zvol is not suspended during first open
258 * (hold zv_suspend_lock) and respect proper lock acquisition
259 * ordering - zv_suspend_lock before zv_state_lock
261 if (zv->zv_open_count == 0) {
262 drop_suspend = B_TRUE;
263 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
264 mutex_exit(&zv->zv_state_lock);
265 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
266 mutex_enter(&zv->zv_state_lock);
267 /* check to see if zv_suspend_lock is needed */
268 if (zv->zv_open_count != 0) {
269 rw_exit(&zv->zv_suspend_lock);
270 drop_suspend = B_FALSE;
274 rw_exit(&zvol_state_lock);
276 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
278 if (zv->zv_open_count == 0) {
279 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
280 err = zvol_first_open(zv, !(flag & FWRITE));
/* Publish geometry to GEOM after a successful first open. */
283 pp->mediasize = zv->zv_volsize;
284 pp->stripeoffset = 0;
285 pp->stripesize = zv->zv_volblocksize;
289 * Check for a bad on-disk format version now since we
290 * lied about owning the dataset readonly before.
292 if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
293 dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
294 err = SET_ERROR(EROFS);
/* ZFS allows only a single exclusive consumer. */
297 if (zv->zv_flags & ZVOL_EXCL) {
298 err = SET_ERROR(EBUSY);
303 if (zv->zv_open_count != 0) {
304 err = SET_ERROR(EBUSY);
307 zv->zv_flags |= ZVOL_EXCL;
311 zv->zv_open_count += count;
313 if (zv->zv_open_count == 0) {
318 mutex_exit(&zv->zv_state_lock);
321 mutex_exit(&spa_namespace_lock);
323 rw_exit(&zv->zv_suspend_lock);
/*
 * GEOM close path: drops `count` references, clearing ZVOL_EXCL and
 * taking zv_suspend_lock when this is the last close.
 * NOTE(review): fragmentary extract — some lines (declarations,
 * closing braces, zvol_last_close call) are missing from view.
 */
329 zvol_geom_close(struct g_provider *pp, int flag, int count)
332 boolean_t drop_suspend = B_TRUE;
335 rw_enter(&zvol_state_lock, ZVOL_RW_READER);
338 rw_exit(&zvol_state_lock);
339 return (SET_ERROR(ENXIO));
342 mutex_enter(&zv->zv_state_lock);
343 if (zv->zv_flags & ZVOL_EXCL) {
344 ASSERT3U(zv->zv_open_count, ==, 1);
345 zv->zv_flags &= ~ZVOL_EXCL;
348 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
351 * If the open count is zero, this is a spurious close.
352 * That indicates a bug in the kernel / DDI framework.
354 ASSERT3U(zv->zv_open_count, >, 0);
357 * make sure zvol is not suspended during last close
358 * (hold zv_suspend_lock) and respect proper lock acquisition
359 * ordering - zv_suspend_lock before zv_state_lock
361 new_open_count = zv->zv_open_count - count;
362 if (new_open_count == 0) {
363 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
364 mutex_exit(&zv->zv_state_lock);
365 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
366 mutex_enter(&zv->zv_state_lock);
367 /* check to see if zv_suspend_lock is needed */
368 new_open_count = zv->zv_open_count - count;
369 if (new_open_count != 0) {
370 rw_exit(&zv->zv_suspend_lock);
371 drop_suspend = B_FALSE;
375 drop_suspend = B_FALSE;
377 rw_exit(&zvol_state_lock);
379 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
382 * You may get multiple opens, but only one close.
384 zv->zv_open_count = new_open_count;
385 if (zv->zv_open_count == 0) {
386 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
391 mutex_exit(&zv->zv_state_lock);
394 rw_exit(&zv->zv_suspend_lock);
/*
 * Mark the provider usable and start the per-zvol worker kthread
 * that services queued bios (see zvol_geom_worker).
 */
399 zvol_geom_run(zvol_state_t *zv)
401 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
402 struct g_provider *pp = zsg->zsg_provider;
404 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
/* Clear the provider's error so GEOM starts dispatching I/O to it. */
406 g_error_provider(pp, 0);
408 kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
409 "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
/*
 * Tear down the GEOM side of a zvol: detach the provider pointer and
 * wither the geom. Caller context: presumably holds g_topology_lock —
 * TODO confirm (the g_topology_assert line is not visible here).
 */
413 zvol_geom_destroy(zvol_state_t *zv)
415 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
416 struct g_provider *pp = zsg->zsg_provider;
418 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
/* The worker must have been stopped and acknowledged (RUNNING again). */
422 mutex_enter(&zv->zv_state_lock);
423 VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
424 mutex_exit(&zv->zv_state_lock);
425 zsg->zsg_provider = NULL;
426 g_wither_geom(pp->geom, ENXIO);
/*
 * Flag a GEOM-mode zvol as dying and wait (bounded, 10s) for any
 * remaining opens to drain before minor removal proceeds.
 */
430 zvol_wait_close(zvol_state_t *zv)
433 if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
435 mutex_enter(&zv->zv_state_lock);
436 zv->zv_zso->zso_dying = B_TRUE;
/* Timed sleep: give up after 10*hz ticks even if still open. */
438 if (zv->zv_open_count)
439 msleep(zv, &zv->zv_state_lock,
440 PRIBIO, "zvol:dying", 10*hz);
441 mutex_exit(&zv->zv_state_lock);
/*
 * GEOM access method: translate (acr, acw, ace) reference deltas into
 * zvol_geom_open()/zvol_geom_close() calls with an open count and
 * FREAD/FWRITE flags. All deltas must share one sign (open or close).
 */
446 zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
448 int count, error, flags;
453 * To make it easier we expect either open or close, but not both
456 KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
457 (acr <= 0 && acw <= 0 && ace <= 0),
458 ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
459 pp->name, acr, acw, ace));
/* Provider already orphaned: only pure release requests succeed. */
461 if (pp->private == NULL) {
462 if (acr <= 0 && acw <= 0 && ace <= 0)
468 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
469 * ace != 0, because GEOM already handles that and handles it a bit
470 * differently. GEOM allows for multiple read/exclusive consumers and
471 * ZFS allows only one exclusive consumer, no matter if it is reader or
472 * writer. I like better the way GEOM works so I'll leave it for GEOM
473 * to decide what to do.
476 count = acr + acw + ace;
481 if (acr != 0 || ace != 0)
488 error = zvol_geom_open(pp, flags, count);
490 error = zvol_geom_close(pp, flags, -count);
/*
 * Per-zvol worker kthread: dequeues bios from zsg_queue and services
 * them via zvol_geom_bio_strategy(); sleeps when the queue is empty
 * and exits when zsg_state is set to ZVOL_GEOM_STOPPED (it then sets
 * RUNNING back and wakes the stopper as an acknowledgement).
 */
496 zvol_geom_worker(void *arg)
498 zvol_state_t *zv = arg;
499 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
502 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
504 thread_lock(curthread);
505 sched_prio(curthread, PRIBIO);
506 thread_unlock(curthread);
509 mtx_lock(&zsg->zsg_queue_mtx);
510 bp = bioq_takefirst(&zsg->zsg_queue);
/* Stop request: acknowledge by flipping state back to RUNNING. */
512 if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
513 zsg->zsg_state = ZVOL_GEOM_RUNNING;
514 wakeup(&zsg->zsg_state);
515 mtx_unlock(&zsg->zsg_queue_mtx);
/* Empty queue: sleep until zvol_geom_bio_start wakes us. */
518 msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
519 PRIBIO | PDROP, "zvol:io", 0);
522 mtx_unlock(&zsg->zsg_queue_mtx);
523 zvol_geom_bio_strategy(bp);
/*
 * GEOM start method: handle BIO_GETATTR inline; otherwise run the
 * strategy directly if this thread may sleep, or defer the bio to the
 * worker kthread's queue when called from a non-sleepable context.
 */
528 zvol_geom_bio_start(struct bio *bp)
530 zvol_state_t *zv = bp->bio_to->private;
531 struct zvol_state_geom *zsg;
/* No backing zvol (orphaned provider): fail the bio. */
535 g_io_deliver(bp, ENXIO);
538 if (bp->bio_cmd == BIO_GETATTR) {
539 if (zvol_geom_bio_getattr(bp))
540 g_io_deliver(bp, EOPNOTSUPP);
544 if (!THREAD_CAN_SLEEP()) {
545 zsg = &zv->zv_zso->zso_geom;
546 mtx_lock(&zsg->zsg_queue_mtx);
547 first = (bioq_first(&zsg->zsg_queue) == NULL);
548 bioq_insert_tail(&zsg->zsg_queue, bp);
549 mtx_unlock(&zsg->zsg_queue_mtx);
/* Only wake the worker on the empty->non-empty transition. */
551 wakeup_one(&zsg->zsg_queue);
555 zvol_geom_bio_strategy(bp);
/*
 * Answer GEOM attribute queries: GEOM::candelete plus the dataset and
 * pool space counters (blocksavail/blocksused/poolblocksavail/
 * poolblocksused), all reported in DEV_BSIZE units.
 * Returns nonzero when the attribute was not handled (caller then
 * delivers EOPNOTSUPP).
 */
559 zvol_geom_bio_getattr(struct bio *bp)
563 zv = bp->bio_to->private;
564 ASSERT3P(zv, !=, NULL);
566 spa_t *spa = dmu_objset_spa(zv->zv_objset);
567 uint64_t refd, avail, usedobjs, availobjs;
/* zvols always support BIO_DELETE, so candelete is 1. */
569 if (g_handleattr_int(bp, "GEOM::candelete", 1))
571 if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
572 dmu_objset_space(zv->zv_objset, &refd, &avail,
573 &usedobjs, &availobjs);
574 if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
576 } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
577 dmu_objset_space(zv->zv_objset, &refd, &avail,
578 &usedobjs, &availobjs);
579 if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
581 } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
582 avail = metaslab_class_get_space(spa_normal_class(spa));
583 avail -= metaslab_class_get_alloc(spa_normal_class(spa));
584 if (g_handleattr_off_t(bp, "poolblocksavail",
587 } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
588 refd = metaslab_class_get_alloc(spa_normal_class(spa));
589 if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
/*
 * Core I/O path shared by GEOM and cdev modes: translates a bio into
 * DMU reads/writes/frees under the zvol rangelock, updates kstats,
 * commits the ZIL for synchronous writes, and completes the bio via
 * g_io_deliver() (GEOM) or biofinish() (cdev).
 * NOTE(review): fragmentary extract — read/write dispatch cases,
 * loop-exit braces and some error paths are missing from view.
 */
596 zvol_geom_bio_strategy(struct bio *bp)
599 uint64_t off, volsize;
603 zfs_locked_range_t *lr;
605 boolean_t doread = B_FALSE;
606 boolean_t is_dumpified;
/* bio_to is set for GEOM bios; cdev bios carry the zvol in si_drv2. */
610 zv = bp->bio_to->private;
612 zv = bp->bio_dev->si_drv2;
615 error = SET_ERROR(ENXIO);
/* Block suspend/resume for the duration of this I/O. */
619 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
621 switch (bp->bio_cmd) {
628 if (zv->zv_flags & ZVOL_RDONLY) {
629 error = SET_ERROR(EROFS);
632 zvol_ensure_zilog(zv);
633 if (bp->bio_cmd == BIO_FLUSH)
637 error = SET_ERROR(EOPNOTSUPP);
641 off = bp->bio_offset;
642 volsize = zv->zv_volsize;
645 ASSERT3P(os, !=, NULL);
648 resid = bp->bio_length;
/* Reject I/O starting at or beyond end of volume. */
650 if (resid > 0 && off >= volsize) {
651 error = SET_ERROR(EIO);
655 is_dumpified = B_FALSE;
656 sync = !doread && !is_dumpified &&
657 zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
660 * There must be no buffer changes when doing a dmu_sync() because
661 * we can't change the data whilst calculating the checksum.
663 lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
664 doread ? RL_READER : RL_WRITER);
/* BIO_DELETE: log the truncate, then free the DMU range. */
666 if (bp->bio_cmd == BIO_DELETE) {
667 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
668 error = dmu_tx_assign(tx, TXG_WAIT);
672 zvol_log_truncate(zv, tx, off, resid, sync);
674 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
/* Read/write loop, chunked to zvol_maxphys per iteration. */
680 while (resid != 0 && off < volsize) {
681 size_t size = MIN(resid, zvol_maxphys);
683 error = dmu_read(os, ZVOL_OBJ, off, size, addr,
686 dmu_tx_t *tx = dmu_tx_create(os);
687 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
688 error = dmu_tx_assign(tx, TXG_WAIT);
692 dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
693 zvol_log_write(zv, tx, off, size, sync);
698 /* convert checksum errors into IO errors */
700 error = SET_ERROR(EIO);
708 zfs_rangelock_exit(lr);
710 bp->bio_completed = bp->bio_length - resid;
711 if (bp->bio_completed < bp->bio_length && off > volsize)
712 error = SET_ERROR(EINVAL);
714 switch (bp->bio_cmd) {
718 dataset_kstats_update_read_kstats(&zv->zv_kstat,
722 dataset_kstats_update_write_kstats(&zv->zv_kstat,
/* Synchronous semantics: push the intent log to stable storage. */
733 zil_commit(zv->zv_zilog, ZVOL_OBJ);
736 rw_exit(&zv->zv_suspend_lock);
739 g_io_deliver(bp, error);
741 biofinish(bp, NULL, error);
745 * Character device mode implementation
/*
 * cdev read(2) path: validate the uio against the volume size, then
 * copy data out via dmu_read_uio_dnode() under a read rangelock,
 * chunked to DMU_MAX_ACCESS/2 bytes per call.
 * NOTE(review): fragmentary extract — zv lookup and loop braces are
 * missing from view.
 */
749 zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
753 zfs_locked_range_t *lr;
757 zfs_uio_init(&uio, uio_s);
761 volsize = zv->zv_volsize;
763 * uio_loffset == volsize isn't an error as
764 * its required for EOF processing.
766 if (zfs_uio_resid(&uio) > 0 &&
767 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
768 return (SET_ERROR(EIO));
770 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
771 zfs_uio_resid(&uio), RL_READER);
772 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
773 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
775 /* don't read past the end */
776 if (bytes > volsize - zfs_uio_offset(&uio))
777 bytes = volsize - zfs_uio_offset(&uio);
779 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
781 /* convert checksum errors into IO errors */
783 error = SET_ERROR(EIO);
787 zfs_rangelock_exit(lr);
/*
 * cdev write(2) path: validate the uio, ensure a ZIL exists, then
 * write through the DMU under a write rangelock in DMU_MAX_ACCESS/2
 * chunks, logging each write and committing the ZIL when IO_SYNC is
 * requested or the dataset uses sync=always.
 * NOTE(review): fragmentary extract — zv lookup and loop-exit braces
 * are missing from view.
 */
793 zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
797 zfs_locked_range_t *lr;
804 volsize = zv->zv_volsize;
806 zfs_uio_init(&uio, uio_s);
808 if (zfs_uio_resid(&uio) > 0 &&
809 (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
810 return (SET_ERROR(EIO));
812 sync = (ioflag & IO_SYNC) ||
813 (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
815 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
816 zvol_ensure_zilog(zv);
818 lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
819 zfs_uio_resid(&uio), RL_WRITER);
820 while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
821 uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
822 uint64_t off = zfs_uio_offset(&uio);
823 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
825 if (bytes > volsize - off) /* don't write past the end */
826 bytes = volsize - off;
828 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
829 error = dmu_tx_assign(tx, TXG_WAIT);
834 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
/* Record the write in the intent log (replayed after a crash). */
836 zvol_log_write(zv, tx, off, bytes, sync);
842 zfs_rangelock_exit(lr);
844 zil_commit(zv->zv_zilog, ZVOL_OBJ);
845 rw_exit(&zv->zv_suspend_lock);
/*
 * cdev open path (volmode=dev): mirrors zvol_geom_open's locking
 * protocol (namespace -> suspend -> state lock) and additionally
 * tracks FSYNC/FDSYNC openers in zsd_sync_cnt so async ZIL entries
 * can be promoted to sync for O_SYNC consumers.
 * NOTE(review): fragmentary extract — zv lookup, error labels and
 * closing braces are missing from view.
 */
850 zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
853 struct zvol_state_dev *zsd;
855 boolean_t drop_suspend = B_FALSE;
856 boolean_t drop_namespace = B_FALSE;
859 rw_enter(&zvol_state_lock, ZVOL_RW_READER);
862 rw_exit(&zvol_state_lock);
863 err = SET_ERROR(ENXIO);
867 if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
869 * We need to guarantee that the namespace lock is held
870 * to avoid spurious failures in zvol_first_open.
872 drop_namespace = B_TRUE;
873 if (!mutex_tryenter(&spa_namespace_lock)) {
874 rw_exit(&zvol_state_lock);
875 mutex_enter(&spa_namespace_lock);
879 mutex_enter(&zv->zv_state_lock);
881 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
884 * make sure zvol is not suspended during first open
885 * (hold zv_suspend_lock) and respect proper lock acquisition
886 * ordering - zv_suspend_lock before zv_state_lock
888 if (zv->zv_open_count == 0) {
889 drop_suspend = B_TRUE;
890 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
891 mutex_exit(&zv->zv_state_lock);
892 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
893 mutex_enter(&zv->zv_state_lock);
894 /* check to see if zv_suspend_lock is needed */
895 if (zv->zv_open_count != 0) {
896 rw_exit(&zv->zv_suspend_lock);
897 drop_suspend = B_FALSE;
901 rw_exit(&zvol_state_lock);
903 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
905 if (zv->zv_open_count == 0) {
906 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
907 err = zvol_first_open(zv, !(flags & FWRITE));
912 if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
913 err = SET_ERROR(EROFS);
/* ZFS allows only a single exclusive consumer. */
916 if (zv->zv_flags & ZVOL_EXCL) {
917 err = SET_ERROR(EBUSY);
922 if (zv->zv_open_count != 0) {
923 err = SET_ERROR(EBUSY);
926 zv->zv_flags |= ZVOL_EXCL;
/* First O_SYNC opener: promote pending async ZIL records to sync. */
931 if (flags & (FSYNC | FDSYNC)) {
932 zsd = &zv->zv_zso->zso_dev;
934 if (zsd->zsd_sync_cnt == 1 &&
935 (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
936 zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
939 if (zv->zv_open_count == 0) {
944 mutex_exit(&zv->zv_state_lock);
947 mutex_exit(&spa_namespace_lock);
949 rw_exit(&zv->zv_suspend_lock);
/*
 * cdev close path: drops one open reference, clears ZVOL_EXCL,
 * decrements the O_SYNC opener count, and calls zvol_last_close()
 * under zv_suspend_lock when the last reference goes away.
 * NOTE(review): fragmentary extract — zv lookup and closing braces
 * are missing from view.
 */
954 zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
957 struct zvol_state_dev *zsd;
958 boolean_t drop_suspend = B_TRUE;
960 rw_enter(&zvol_state_lock, ZVOL_RW_READER);
963 rw_exit(&zvol_state_lock);
964 return (SET_ERROR(ENXIO));
967 mutex_enter(&zv->zv_state_lock);
968 if (zv->zv_flags & ZVOL_EXCL) {
969 ASSERT3U(zv->zv_open_count, ==, 1);
970 zv->zv_flags &= ~ZVOL_EXCL;
973 ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
976 * If the open count is zero, this is a spurious close.
977 * That indicates a bug in the kernel / DDI framework.
979 ASSERT3U(zv->zv_open_count, >, 0);
981 * make sure zvol is not suspended during last close
982 * (hold zv_suspend_lock) and respect proper lock acquisition
983 * ordering - zv_suspend_lock before zv_state_lock
985 if (zv->zv_open_count == 1) {
986 if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
987 mutex_exit(&zv->zv_state_lock);
988 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
989 mutex_enter(&zv->zv_state_lock);
990 /* check to see if zv_suspend_lock is needed */
991 if (zv->zv_open_count != 1) {
992 rw_exit(&zv->zv_suspend_lock);
993 drop_suspend = B_FALSE;
997 drop_suspend = B_FALSE;
999 rw_exit(&zvol_state_lock);
1001 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1004 * You may get multiple opens, but only one close.
1006 zv->zv_open_count--;
1007 if (flags & (FSYNC | FDSYNC)) {
1008 zsd = &zv->zv_zso->zso_dev;
1009 zsd->zsd_sync_cnt--;
1012 if (zv->zv_open_count == 0) {
1013 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1014 zvol_last_close(zv);
1018 mutex_exit(&zv->zv_state_lock);
1021 rw_exit(&zv->zv_suspend_lock);
/*
 * cdev ioctl dispatch: disk geometry queries (DIOCGSECTORSIZE,
 * DIOCGMEDIASIZE, DIOCGSTRIPE*), cache flush, BIO_DELETE-style
 * trimming (DIOCGDELETE path, gated on zvol_unmap_enabled),
 * DIOCGATTR space queries, and FIOSEEKHOLE/FIOSEEKDATA.
 * NOTE(review): fragmentary extract — several `case` labels, breaks
 * and the closing switch brace are missing from view.
 */
1026 zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
1027 int fflag, struct thread *td)
1030 zfs_locked_range_t *lr;
1031 off_t offset, length;
1038 KASSERT(zv->zv_open_count > 0,
1039 ("Device with zero access count in %s", __func__));
1041 i = IOCPARM_LEN(cmd);
1043 case DIOCGSECTORSIZE:
1044 *(uint32_t *)data = DEV_BSIZE;
1046 case DIOCGMEDIASIZE:
1047 *(off_t *)data = zv->zv_volsize;
/* Cache-flush ioctl: commit any pending ZIL records. */
1050 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1051 if (zv->zv_zilog != NULL)
1052 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1053 rw_exit(&zv->zv_suspend_lock);
1056 if (!zvol_unmap_enabled)
/* Delete/trim request: data[] is {offset, length}, DEV_BSIZE aligned. */
1059 offset = ((off_t *)data)[0];
1060 length = ((off_t *)data)[1];
1061 if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
1062 offset < 0 || offset >= zv->zv_volsize ||
1064 printf("%s: offset=%jd length=%jd\n", __func__, offset,
1066 error = SET_ERROR(EINVAL);
1069 rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
1070 zvol_ensure_zilog(zv);
1071 lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
1073 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
1074 error = dmu_tx_assign(tx, TXG_WAIT);
1079 sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
1080 zvol_log_truncate(zv, tx, offset, length, sync);
1082 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1085 zfs_rangelock_exit(lr);
1087 zil_commit(zv->zv_zilog, ZVOL_OBJ);
1088 rw_exit(&zv->zv_suspend_lock);
1090 case DIOCGSTRIPESIZE:
1091 *(off_t *)data = zv->zv_volblocksize;
1093 case DIOCGSTRIPEOFFSET:
/* DIOCGATTR: same space attributes the GEOM getattr path reports. */
1097 spa_t *spa = dmu_objset_spa(zv->zv_objset);
1098 struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
1099 uint64_t refd, avail, usedobjs, availobjs;
1101 if (strcmp(arg->name, "GEOM::candelete") == 0)
1103 else if (strcmp(arg->name, "blocksavail") == 0) {
1104 dmu_objset_space(zv->zv_objset, &refd, &avail,
1105 &usedobjs, &availobjs);
1106 arg->value.off = avail / DEV_BSIZE;
1107 } else if (strcmp(arg->name, "blocksused") == 0) {
1108 dmu_objset_space(zv->zv_objset, &refd, &avail,
1109 &usedobjs, &availobjs);
1110 arg->value.off = refd / DEV_BSIZE;
1111 } else if (strcmp(arg->name, "poolblocksavail") == 0) {
1112 avail = metaslab_class_get_space(spa_normal_class(spa));
1113 avail -= metaslab_class_get_alloc(
1114 spa_normal_class(spa));
1115 arg->value.off = avail / DEV_BSIZE;
1116 } else if (strcmp(arg->name, "poolblocksused") == 0) {
1117 refd = metaslab_class_get_alloc(spa_normal_class(spa));
1118 arg->value.off = refd / DEV_BSIZE;
1120 error = SET_ERROR(ENOIOCTL);
1125 off_t *off = (off_t *)data;
1129 hole = (cmd == FIOSEEKHOLE);
1131 error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
1136 error = SET_ERROR(ENOIOCTL);
/*
 * Lazily open the zvol's ZIL on first write. Caller holds
 * zv_suspend_lock as reader; this upgrades (or re-acquires as writer)
 * to install zv_zilog, then downgrades back to reader. The re-check
 * after re-acquiring handles a racing writer having opened it first.
 */
1147 zvol_ensure_zilog(zvol_state_t *zv)
1149 ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
1152 * Open a ZIL if this is the first time we have written to this
1153 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
1154 * than zv_state_lock so that we don't need to acquire an
1155 * additional lock in this path.
1157 if (zv->zv_zilog == NULL) {
1158 if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
1159 rw_exit(&zv->zv_suspend_lock);
1160 rw_enter(&zv->zv_suspend_lock, RW_WRITER);
1162 if (zv->zv_zilog == NULL) {
1163 zv->zv_zilog = zil_open(zv->zv_objset,
1165 zv->zv_flags |= ZVOL_WRITTEN_TO;
1166 /* replay / destroy done in zvol_create_minor_impl() */
1167 VERIFY0((zv->zv_zilog->zl_header->zh_flags &
1168 ZIL_REPLAY_NEEDED));
1170 rw_downgrade(&zv->zv_suspend_lock);
/*
 * Return nonzero iff `device` is non-NULL and its path begins with
 * ZVOL_DIR (i.e. names a zvol device node).
 */
1175 zvol_is_zvol_impl(const char *device)
1177 return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
/*
 * Rename a zvol's minor: rehash it under the new name, then replace
 * the GEOM provider (geom mode) or destroy and re-create the cdev
 * node (dev mode) with the new path, finally updating zv_name.
 * NOTE(review): fragmentary extract — g_topology_lock acquisition,
 * destroy_dev call and several braces are missing from view.
 */
1181 zvol_rename_minor(zvol_state_t *zv, const char *newname)
1183 ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1184 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1186 /* move to new hashtable entry */
1187 zv->zv_hash = zvol_name_hash(zv->zv_name);
1188 hlist_del(&zv->zv_hlink);
1189 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
1191 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1192 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1193 struct g_provider *pp = zsg->zsg_provider;
1198 ASSERT3P(gp, !=, NULL);
/* Retire the old provider and create one under the new name. */
1200 zsg->zsg_provider = NULL;
1201 g_wither_provider(pp, ENXIO);
1203 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
1204 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1205 pp->sectorsize = DEV_BSIZE;
1206 pp->mediasize = zv->zv_volsize;
1208 zsg->zsg_provider = pp;
1209 g_error_provider(pp, 0);
1210 g_topology_unlock();
1211 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1212 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1214 struct make_dev_args args;
1216 dev = zsd->zsd_cdev;
1219 dev = zsd->zsd_cdev = NULL;
/* Destroying the node force-closes it; reflect that in our state. */
1220 if (zv->zv_open_count > 0) {
1221 zv->zv_flags &= ~ZVOL_EXCL;
1222 zv->zv_open_count = 0;
1223 /* XXX need suspend lock but lock order */
1224 zvol_last_close(zv);
1228 make_dev_args_init(&args);
1229 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1230 args.mda_devsw = &zvol_cdevsw;
1232 args.mda_uid = UID_ROOT;
1233 args.mda_gid = GID_OPERATOR;
1234 args.mda_mode = 0640;
1235 args.mda_si_drv2 = zv;
1236 if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
1238 dev->si_iosize_max = maxphys;
1239 zsd->zsd_cdev = dev;
1242 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
1246 * Remove minor node for the specified volume.
/*
 * Final teardown of a zvol with no remaining opens: destroy locks,
 * the GEOM geom or cdev node, kstats, and free the state structures.
 * Caller must not hold zv_suspend_lock or zv_state_lock.
 * NOTE(review): fragmentary extract — destroy_dev and some braces
 * are missing from view.
 */
1249 zvol_free(zvol_state_t *zv)
1251 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1252 ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
1253 ASSERT0(zv->zv_open_count);
1255 ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
1257 rw_destroy(&zv->zv_suspend_lock);
1258 zfs_rangelock_fini(&zv->zv_rangelock);
1260 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1261 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1262 struct g_provider *pp __maybe_unused = zsg->zsg_provider;
/* zvol_clear_private() must already have detached the provider. */
1264 ASSERT3P(pp->private, ==, NULL);
1267 zvol_geom_destroy(zv);
1268 g_topology_unlock();
1269 mtx_destroy(&zsg->zsg_queue_mtx);
1270 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1271 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1272 struct cdev *dev = zsd->zsd_cdev;
1274 ASSERT3P(dev->si_drv2, ==, NULL);
1279 mutex_destroy(&zv->zv_state_lock);
1280 dataset_kstats_destroy(&zv->zv_kstat);
1281 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1282 kmem_free(zv, sizeof (zvol_state_t));
1287 * Create a minor node (plus a whole lot more) for the specified volume.
/*
 * Create the minor node (GEOM provider or cdev) for dataset `name`:
 * own the objset read-only, read volsize/volmode, allocate and wire
 * up zvol_state_t, replay or destroy the ZIL, create kstats, then
 * disown the objset and insert the zvol into the global hash.
 * Returns EEXIST if a zvol of that name already exists.
 * NOTE(review): fragmentary extract — error labels, hash insertion
 * and several braces are missing from view.
 */
1290 zvol_create_minor_impl(const char *name)
1294 dmu_object_info_t *doi;
1296 uint64_t volmode, hash;
1299 ZFS_LOG(1, "Creating ZVOL %s...", name);
1300 hash = zvol_name_hash(name);
1301 if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
1302 ASSERT(MUTEX_HELD(&zv->zv_state_lock));
1303 mutex_exit(&zv->zv_state_lock);
1304 return (SET_ERROR(EEXIST));
1309 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
1311 /* lie and say we're read-only */
1312 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
1316 error = dmu_object_info(os, ZVOL_OBJ, doi);
1318 goto out_dmu_objset_disown;
1320 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
1322 goto out_dmu_objset_disown;
1324 error = dsl_prop_get_integer(name,
1325 zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
1326 if (error || volmode == ZFS_VOLMODE_DEFAULT)
1327 volmode = zvol_volmode;
1331 * zvol_alloc equivalent ...
1333 zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
1335 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
1336 zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
1337 zv->zv_volmode = volmode;
1338 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1339 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1340 struct g_provider *pp;
1343 zsg->zsg_state = ZVOL_GEOM_UNINIT;
1344 mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
/* Build the geom + provider; run is deferred until after ZIL setup. */
1347 gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
1348 gp->start = zvol_geom_bio_start;
1349 gp->access = zvol_geom_access;
1350 pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
1351 pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
1352 pp->sectorsize = DEV_BSIZE;
1356 zsg->zsg_provider = pp;
1357 bioq_init(&zsg->zsg_queue);
1358 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1359 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1361 struct make_dev_args args;
1363 make_dev_args_init(&args);
1364 args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
1365 args.mda_devsw = &zvol_cdevsw;
1367 args.mda_uid = UID_ROOT;
1368 args.mda_gid = GID_OPERATOR;
1369 args.mda_mode = 0640;
1370 args.mda_si_drv2 = zv;
1371 error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
/* make_dev_s failed: unwind the partially built state. */
1373 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
1374 mutex_destroy(&zv->zv_state_lock);
1375 kmem_free(zv, sizeof (*zv));
1376 dmu_objset_disown(os, B_TRUE, FTAG);
1379 dev->si_iosize_max = maxphys;
1380 zsd->zsd_cdev = dev;
1382 (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
1383 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
1384 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
1386 if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
1387 zv->zv_flags |= ZVOL_RDONLY;
1389 zv->zv_volblocksize = doi->doi_data_block_size;
1390 zv->zv_volsize = volsize;
/* Replay (or discard) the intent log once, at minor creation time. */
1393 ASSERT3P(zv->zv_zilog, ==, NULL);
1394 zv->zv_zilog = zil_open(os, zvol_get_data);
1395 if (spa_writeable(dmu_objset_spa(os))) {
1396 if (zil_replay_disable)
1397 zil_destroy(zv->zv_zilog, B_FALSE);
1399 zil_replay(os, zv, zvol_replay_vector);
1401 zil_close(zv->zv_zilog);
1402 zv->zv_zilog = NULL;
1403 ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
1404 dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
1406 /* TODO: prefetch for geom tasting */
1408 zv->zv_objset = NULL;
1409 out_dmu_objset_disown:
1410 dmu_objset_disown(os, B_TRUE, FTAG);
1412 if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
1414 g_topology_unlock();
1417 kmem_free(doi, sizeof (dmu_object_info_t));
1419 rw_enter(&zvol_state_lock, RW_WRITER);
1422 rw_exit(&zvol_state_lock);
1423 ZFS_LOG(1, "ZVOL %s created.", name);
/*
 * Detach the zvol from its device glue ahead of zvol_free(): stop the
 * GEOM worker (handshake via zsg_state) and clear pp->private, or
 * clear the cdev's si_drv2 back-pointer in dev mode.
 * NOTE(review): fragmentary extract — pp->private clearing and some
 * braces are missing from view.
 */
1430 zvol_clear_private(zvol_state_t *zv)
1432 ASSERT(RW_LOCK_HELD(&zvol_state_lock));
1433 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1434 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1435 struct g_provider *pp = zsg->zsg_provider;
1437 if (pp->private == NULL) /* already cleared */
/* Ask the worker to stop and wait for its RUNNING acknowledgement. */
1440 mtx_lock(&zsg->zsg_queue_mtx);
1441 zsg->zsg_state = ZVOL_GEOM_STOPPED;
1443 wakeup_one(&zsg->zsg_queue);
1444 while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
1445 msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
1447 mtx_unlock(&zsg->zsg_queue_mtx);
1448 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
1449 } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
1450 struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
1451 struct cdev *dev = zsd->zsd_cdev;
1453 dev->si_drv2 = NULL;
/*
 * Propagate a volsize change to the device layer: in GEOM mode,
 * resize the provider (skipping the resize event when the previous
 * size was 0, i.e. before first open); dev mode needs no extra work.
 */
1458 zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
1460 zv->zv_volsize = volsize;
1461 if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
1462 struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
1463 struct g_provider *pp = zsg->zsg_provider;
/* Provider already detached (zvol dying): nothing to resize. */
1467 if (pp->private == NULL) {
1468 g_topology_unlock();
1469 return (SET_ERROR(ENXIO));
1473 * Do not invoke resize event when initial size was zero.
1474 * ZVOL initializes the size on first open, this is not
1477 if (pp->mediasize == 0)
1478 pp->mediasize = zv->zv_volsize;
1480 g_resize_provider(pp, zv->zv_volsize);
1482 g_topology_unlock();
/*
 * Platform hook for marking the disk read-only; intentionally a no-op
 * on FreeBSD (the Linux set_disk_ro equivalent is left commented).
 */
1488 zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
1490 // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
/*
 * Platform hook for updating the advertised capacity; intentionally a
 * no-op on FreeBSD (the Linux set_capacity equivalent is commented).
 */
1494 zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
1496 // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
/*
 * FreeBSD implementation table handed to the platform-independent
 * zvol core via zvol_register_ops().
 */
1499 const static zvol_platform_ops_t zvol_freebsd_ops = {
1500 .zv_free = zvol_free,
1501 .zv_rename_minor = zvol_rename_minor,
1502 .zv_create_minor = zvol_create_minor_impl,
1503 .zv_update_volsize = zvol_update_volsize,
1504 .zv_clear_private = zvol_clear_private,
1505 .zv_is_zvol = zvol_is_zvol_impl,
1506 .zv_set_disk_ro = zvol_set_disk_ro_impl,
1507 .zv_set_capacity = zvol_set_capacity_impl,
1517 return (zvol_minors != 0);
1524 zvol_register_ops(&zvol_freebsd_ops);