zfs: merge OpenZFS master-9305ff2ed
sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
 * All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/<pool_name>/<dataset_name>
 *
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 *
 * On FreeBSD, ZVOLs are GEOM providers like any other storage device
 * in the system, unless volmode=dev, in which case they are exposed
 * only as character devices.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/byteorder.h>
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/zfeature.h>
#include <sys/zio_checksum.h>
#include <sys/filio.h>

#include <geom/geom.h>
#include <sys/zvol_impl.h>

#include "zfs_namecheck.h"

#define	ZVOL_DUMPSIZE		"dumpsize"

#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	int zso_dying;
};

static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");
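
/*
 * Both sysctls above are CTLFLAG_RWTUN, so they may be set as loader(8)
 * tunables or changed at runtime, e.g. (illustrative values):
 *
 *	sysctl vfs.zfs.vol.mode=2	# expose new zvols as cdevs only
 *	sysctl vfs.zfs.vol.recursive=1	# allow zpools on zvols (unsafe)
 */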

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;

static void zvol_ensure_zilog(zvol_state_t *zv);

static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
static d_strategy_t	zvol_geom_bio_strategy;

static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t zvol_geom_bio_strategy; (declared above with zvol_cdevsw) */

/*
 * GEOM mode implementation
 */
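
/*
 * Open callback.  Reached via zvol_geom_access(), which drops the GEOM
 * topology lock before calling here; "count" is the number of access
 * references being added.
 */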
/*ARGSUSED*/
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * If zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock.
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_zv_locked;
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

/*ARGSUSED*/
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

static void
zvol_geom_run(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_error_provider(pp, 0);

	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
}

static void
zvol_geom_destroy(zvol_state_t *zv)
{
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct g_provider *pp = zsg->zsg_provider;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	g_topology_assert();

	mutex_enter(&zv->zv_state_lock);
	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
	mutex_exit(&zv->zv_state_lock);
	zsg->zsg_provider = NULL;
	g_wither_geom(pp->geom, ENXIO);
}

void
zvol_wait_close(zvol_state_t *zv)
{

	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
		return;
	mutex_enter(&zv->zv_state_lock);
	zv->zv_zso->zso_dying = B_TRUE;

	if (zv->zv_open_count)
		msleep(zv, &zv->zv_state_lock,
		    PRIBIO, "zvol:dying", 10*hz);
	mutex_exit(&zv->zv_state_lock);
}

static int
zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
{
	int count, error, flags;

	g_topology_assert();

	/*
	 * To make it easier we expect either open or close, but not both
	 * at the same time.
	 */
	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
	    (acr <= 0 && acw <= 0 && ace <= 0),
	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
	    pp->name, acr, acw, ace));

	if (pp->private == NULL) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		return (pp->error);
	}

	/*
	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
	 * ace != 0, because GEOM already handles that and handles it a bit
	 * differently. GEOM allows for multiple read/exclusive consumers and
	 * ZFS allows only one exclusive consumer, no matter if it is reader or
	 * writer. I like better the way GEOM works so I'll leave it for GEOM
	 * to decide what to do.
	 */

	count = acr + acw + ace;
	if (count == 0)
		return (0);

	flags = 0;
	if (acr != 0 || ace != 0)
		flags |= FREAD;
	if (acw != 0)
		flags |= FWRITE;

	g_topology_unlock();
	if (count > 0)
		error = zvol_geom_open(pp, flags, count);
	else
		error = zvol_geom_close(pp, flags, -count);
	g_topology_lock();
	return (error);
}

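/*
 * Worker thread that services bios queued by zvol_geom_bio_start() from
 * contexts that cannot sleep.  It exits once zvol_clear_private() has
 * moved the queue state to ZVOL_GEOM_STOPPED.
 */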
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		zvol_geom_bio_strategy(bp);
	}
}

static void
zvol_geom_bio_start(struct bio *bp)
{
	zvol_state_t *zv = bp->bio_to->private;
	struct zvol_state_geom *zsg;
	boolean_t first;

	if (zv == NULL) {
		g_io_deliver(bp, ENXIO);
		return;
	}
	if (bp->bio_cmd == BIO_GETATTR) {
		if (zvol_geom_bio_getattr(bp))
			g_io_deliver(bp, EOPNOTSUPP);
		return;
	}

	if (!THREAD_CAN_SLEEP()) {
		zsg = &zv->zv_zso->zso_geom;
		mtx_lock(&zsg->zsg_queue_mtx);
		first = (bioq_first(&zsg->zsg_queue) == NULL);
		bioq_insert_tail(&zsg->zsg_queue, bp);
		mtx_unlock(&zsg->zsg_queue_mtx);
		if (first)
			wakeup_one(&zsg->zsg_queue);
		return;
	}

	zvol_geom_bio_strategy(bp);
}

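/*
 * Answer GEOM::candelete plus the blocks{avail,used} and
 * poolblocks{avail,used} attributes.  Returns nonzero for attributes it
 * does not handle, letting the caller deliver EOPNOTSUPP.
 */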
static int
zvol_geom_bio_getattr(struct bio *bp)
{
	zvol_state_t *zv;

	zv = bp->bio_to->private;
	ASSERT3P(zv, !=, NULL);

	spa_t *spa = dmu_objset_spa(zv->zv_objset);
	uint64_t refd, avail, usedobjs, availobjs;

	if (g_handleattr_int(bp, "GEOM::candelete", 1))
		return (0);
	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
		dmu_objset_space(zv->zv_objset, &refd, &avail,
		    &usedobjs, &availobjs);
		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
		avail = metaslab_class_get_space(spa_normal_class(spa));
		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksavail",
		    avail / DEV_BSIZE))
			return (0);
	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
		refd = metaslab_class_get_alloc(spa_normal_class(spa));
		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
			return (0);
	}
	return (1);
}

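/*
 * Common I/O path.  Entered either as the GEOM start routine (bio_to is
 * set) or via the cdev d_strategy entry point (bio_dev is set); the
 * completion call at the end is chosen to match.
 */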
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}

/*
 * Character device mode implementation
 */

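/*
 * cdev read/write paths.  Each transfer is range-locked and split into
 * chunks of at most DMU_MAX_ACCESS / 2 so that a single DMU transaction
 * stays within its limits.
 */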
static int
zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_init(&uio, uio_s);

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;
	/*
	 * uio_loffset == volsize isn't an error as
	 * it's required for EOF processing.
	 */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_READER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - zfs_uio_offset(&uio))
			bytes = volsize - zfs_uio_offset(&uio);

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}

static int
zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * Make sure the zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flags & FWRITE));
		if (err)
			goto out_zv_locked;
	}

	if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flags & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count++;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt++;
		if (zsd->zsd_sync_cnt == 1 &&
		    (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
			zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
	}
out_opened:
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}

static int
zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	zvol_state_t *zv;
	struct zvol_state_dev *zsd;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = dev->si_drv2;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * Make sure the zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* Check to see if zv_suspend_lock is needed. */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count--;
	if (flags & (FSYNC | FDSYNC)) {
		zsd = &zv->zv_zso->zso_dev;
		zsd->zsd_sync_cnt--;
	}

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}

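/*
 * disk(4)-style ioctls.  DIOCGDELETE takes two off_t values, offset and
 * length, both of which must be DEV_BSIZE-aligned.
 */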
static int
zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
    int fflag, struct thread *td)
{
	zvol_state_t *zv;
	zfs_locked_range_t *lr;
	off_t offset, length;
	int i, error;
	boolean_t sync;

	zv = dev->si_drv2;

	error = 0;
	KASSERT(zv->zv_open_count > 0,
	    ("Device with zero access count in %s", __func__));

	i = IOCPARM_LEN(cmd);
	switch (cmd) {
	case DIOCGSECTORSIZE:
		*(uint32_t *)data = DEV_BSIZE;
		break;
	case DIOCGMEDIASIZE:
		*(off_t *)data = zv->zv_volsize;
		break;
	case DIOCGFLUSH:
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		if (zv->zv_zilog != NULL)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGDELETE:
		if (!zvol_unmap_enabled)
			break;

		offset = ((off_t *)data)[0];
		length = ((off_t *)data)[1];
		if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
		    offset < 0 || offset >= zv->zv_volsize ||
		    length <= 0) {
			printf("%s: offset=%jd length=%jd\n", __func__, offset,
			    length);
			error = SET_ERROR(EINVAL);
			break;
		}
		rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
		zvol_ensure_zilog(zv);
		lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
		    RL_WRITER);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			sync = FALSE;
			dmu_tx_abort(tx);
		} else {
			sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
			zvol_log_truncate(zv, tx, offset, length, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    offset, length);
		}
		zfs_rangelock_exit(lr);
		if (sync)
			zil_commit(zv->zv_zilog, ZVOL_OBJ);
		rw_exit(&zv->zv_suspend_lock);
		break;
	case DIOCGSTRIPESIZE:
		*(off_t *)data = zv->zv_volblocksize;
		break;
	case DIOCGSTRIPEOFFSET:
		*(off_t *)data = 0;
		break;
	case DIOCGATTR: {
		spa_t *spa = dmu_objset_spa(zv->zv_objset);
		struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
		uint64_t refd, avail, usedobjs, availobjs;

		if (strcmp(arg->name, "GEOM::candelete") == 0)
			arg->value.i = 1;
		else if (strcmp(arg->name, "blocksavail") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "blocksused") == 0) {
			dmu_objset_space(zv->zv_objset, &refd, &avail,
			    &usedobjs, &availobjs);
			arg->value.off = refd / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksavail") == 0) {
			avail = metaslab_class_get_space(spa_normal_class(spa));
			avail -= metaslab_class_get_alloc(
			    spa_normal_class(spa));
			arg->value.off = avail / DEV_BSIZE;
		} else if (strcmp(arg->name, "poolblocksused") == 0) {
			refd = metaslab_class_get_alloc(spa_normal_class(spa));
			arg->value.off = refd / DEV_BSIZE;
		} else
			error = SET_ERROR(ENOIOCTL);
		break;
	}
	case FIOSEEKHOLE:
	case FIOSEEKDATA: {
		off_t *off = (off_t *)data;
		uint64_t noff;
		boolean_t hole;

		hole = (cmd == FIOSEEKHOLE);
		noff = *off;
		error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
		*off = noff;
		break;
	}
	default:
		error = SET_ERROR(ENOIOCTL);
	}

	return (error);
}
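
/*
 * Illustrative userland use of DIOCGDELETE (hypothetical device path and
 * sizes, not part of this driver):
 *
 *	off_t args[2] = { 0, 1 << 20 };		// offset, length
 *	int fd = open("/dev/zvol/pool/vol", O_RDWR);
 *	if (fd >= 0 && ioctl(fd, DIOCGDELETE, args) == -1)
 *		warn("DIOCGDELETE");
 */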

/*
 * Misc. helpers
 */

static void
zvol_ensure_zilog(zvol_state_t *zv)
{
	ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));

	/*
	 * Open a ZIL if this is the first time we have written to this
	 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
	 * than zv_state_lock so that we don't need to acquire an
	 * additional lock in this path.
	 */
	if (zv->zv_zilog == NULL) {
		if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
		}
		if (zv->zv_zilog == NULL) {
			zv->zv_zilog = zil_open(zv->zv_objset,
			    zvol_get_data);
			zv->zv_flags |= ZVOL_WRITTEN_TO;
			/* replay / destroy done in zvol_create_minor_impl() */
			VERIFY0((zv->zv_zilog->zl_header->zh_flags &
			    ZIL_REPLAY_NEEDED));
		}
		rw_downgrade(&zv->zv_suspend_lock);
	}
}

static boolean_t
zvol_is_zvol_impl(const char *device)
{
	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
}

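/*
 * Renaming cannot be done in place: the old GEOM provider or cdev is
 * destroyed and a fresh one is created under the new name, which is why
 * any remaining open count on a cdev is forcibly dropped below.
 */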
static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;
		struct g_geom *gp;

		g_topology_lock();
		gp = pp->geom;
		ASSERT3P(gp, !=, NULL);

		zsg->zsg_provider = NULL;
		g_wither_provider(pp, ENXIO);

		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = zv->zv_volsize;
		pp->private = zv;
		zsg->zsg_provider = pp;
		g_error_provider(pp, 0);
		g_topology_unlock();
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		dev = zsd->zsd_cdev;
		if (dev != NULL) {
			destroy_dev(dev);
			dev = zsd->zsd_cdev = NULL;
			if (zv->zv_open_count > 0) {
				zv->zv_flags &= ~ZVOL_EXCL;
				zv->zv_open_count = 0;
				/* XXX  need suspend lock but lock order */
				zvol_last_close(zv);
			}
		}

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
		    == 0) {
			dev->si_iosize_max = maxphys;
			zsd->zsd_cdev = dev;
		}
	}
	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
}

/*
 * Remove minor node for the specified volume.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);

	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp __maybe_unused = zsg->zsg_provider;

		ASSERT3P(pp->private, ==, NULL);

		g_topology_lock();
		zvol_geom_destroy(zv);
		g_topology_unlock();
		mtx_destroy(&zsg->zsg_queue_mtx);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		ASSERT3P(dev->si_drv2, ==, NULL);

		destroy_dev(dev);
	}

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);
	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	zvol_minors--;
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
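/*
 * Sketch of the sequence below: own the objset (claiming read-only),
 * read the volume size and volmode properties, create either a GEOM
 * provider or a cdev, replay or discard the ZIL, then hash-insert the
 * new zvol_state_t.
 */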
static int
zvol_create_minor_impl(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t volmode, hash;
	int error;

	ZFS_LOG(1, "Creating ZVOL %s...", name);
	hash = zvol_name_hash(name);
	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		return (SET_ERROR(EEXIST));
	}

	DROP_GIANT();

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = dsl_prop_get_integer(name,
	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
	if (error || volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;
	error = 0;

	/*
	 * zvol_alloc equivalent ...
	 */
	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
	zv->zv_hash = hash;
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_volmode = volmode;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp;
		struct g_geom *gp;

		zsg->zsg_state = ZVOL_GEOM_UNINIT;
		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);

		g_topology_lock();
		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
		gp->start = zvol_geom_bio_start;
		gp->access = zvol_geom_access;
		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
		pp->sectorsize = DEV_BSIZE;
		pp->mediasize = 0;
		pp->private = zv;

		zsg->zsg_provider = pp;
		bioq_init(&zsg->zsg_queue);
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev;
		struct make_dev_args args;

		make_dev_args_init(&args);
		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
		args.mda_devsw = &zvol_cdevsw;
		args.mda_cr = NULL;
		args.mda_uid = UID_ROOT;
		args.mda_gid = GID_OPERATOR;
		args.mda_mode = 0640;
		args.mda_si_drv2 = zv;
		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
		if (error) {
			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
			mutex_destroy(&zv->zv_state_lock);
			kmem_free(zv, sizeof (*zv));
			dmu_objset_disown(os, B_TRUE, FTAG);
			goto out_doi;
		}
		dev->si_iosize_max = maxphys;
		zsd->zsd_cdev = dev;
	}
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);

	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/* TODO: prefetch for geom tasting */

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);

	if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
		zvol_geom_run(zv);
		g_topology_unlock();
	}
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		zvol_minors++;
		rw_exit(&zvol_state_lock);
		ZFS_LOG(1, "ZVOL %s created.", name);
	}
	PICKUP_GIANT();
	return (error);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		if (pp->private == NULL) /* already cleared */
			return;

		mtx_lock(&zsg->zsg_queue_mtx);
		zsg->zsg_state = ZVOL_GEOM_STOPPED;
		pp->private = NULL;
		wakeup_one(&zsg->zsg_queue);
		while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
			msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
			    0, "zvol:w", 0);
		mtx_unlock(&zsg->zsg_queue_mtx);
		ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	} else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
		struct cdev *dev = zsd->zsd_cdev;

		dev->si_drv2 = NULL;
	}
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	zv->zv_volsize = volsize;
	if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
		struct g_provider *pp = zsg->zsg_provider;

		g_topology_lock();

		if (pp->private == NULL) {
			g_topology_unlock();
			return (SET_ERROR(ENXIO));
		}

		/*
		 * Do not invoke resize event when initial size was zero.
		 * ZVOL initializes the size on first open, this is not
		 * real resizing.
		 */
		if (pp->mediasize == 0)
			pp->mediasize = zv->zv_volsize;
		else
			g_resize_provider(pp, zv->zv_volsize);

		g_topology_unlock();
	}
	return (0);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	/* Nothing to do on FreeBSD; the Linux port calls set_disk_ro() here. */
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	/* Nothing to do on FreeBSD; the Linux port calls set_capacity() here. */
}

static const zvol_platform_ops_t zvol_freebsd_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_create_minor_impl,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

/*
 * Public interfaces
 */

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

int
zvol_init(void)
{
	zvol_init_impl();
	zvol_register_ops(&zvol_freebsd_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
}