Fix hangs with processes stuck sleeping on btalloc on i386.
[freebsd.git] / sys / cddl / contrib / opensolaris / uts / common / fs / zfs / dsl_destroy.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
24  * Copyright (c) 2013 Steven Hartland. All rights reserved.
25  * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28
29 #include <sys/zfs_context.h>
30 #include <sys/dsl_userhold.h>
31 #include <sys/dsl_dataset.h>
32 #include <sys/dsl_synctask.h>
33 #include <sys/dsl_destroy.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/dsl_pool.h>
36 #include <sys/dsl_dir.h>
37 #include <sys/dmu_traverse.h>
38 #include <sys/dsl_scan.h>
39 #include <sys/dmu_objset.h>
40 #include <sys/zap.h>
41 #include <sys/zfeature.h>
42 #include <sys/zfs_ioctl.h>
43 #include <sys/dsl_deleg.h>
44 #include <sys/dmu_impl.h>
45 #include <sys/zcp.h>
46 #if defined(__FreeBSD__) && defined(_KERNEL)
47 #include <sys/zvol.h>
48 #endif
49
50
51 int
52 dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
53 {
54         if (!ds->ds_is_snapshot)
55                 return (SET_ERROR(EINVAL));
56
57         if (dsl_dataset_long_held(ds))
58                 return (SET_ERROR(EBUSY));
59
60         /*
61          * Only allow deferred destroy on pools that support it.
62          * NOTE: deferred destroy is only supported on snapshots.
63          */
64         if (defer) {
65                 if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
66                     SPA_VERSION_USERREFS)
67                         return (SET_ERROR(ENOTSUP));
68                 return (0);
69         }
70
71         /*
72          * If this snapshot has an elevated user reference count,
73          * we can't destroy it yet.
74          */
75         if (ds->ds_userrefs > 0)
76                 return (SET_ERROR(EBUSY));
77
78         /*
79          * Can't delete a branch point.
80          */
81         if (dsl_dataset_phys(ds)->ds_num_children > 1)
82                 return (SET_ERROR(EEXIST));
83
84         return (0);
85 }
86
87 int
88 dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
89 {
90         dsl_destroy_snapshot_arg_t *ddsa = arg;
91         const char *dsname = ddsa->ddsa_name;
92         boolean_t defer = ddsa->ddsa_defer;
93
94         dsl_pool_t *dp = dmu_tx_pool(tx);
95         int error = 0;
96         dsl_dataset_t *ds;
97
98         error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
99
100         /*
101          * If the snapshot does not exist, silently ignore it, and
102          * dsl_destroy_snapshot_sync() will be a no-op
103          * (it's "already destroyed").
104          */
105         if (error == ENOENT)
106                 return (0);
107
108         if (error == 0) {
109                 error = dsl_destroy_snapshot_check_impl(ds, defer);
110                 dsl_dataset_rele(ds, FTAG);
111         }
112
113         return (error);
114 }
115
/*
 * State shared between process_old_deadlist() and its bpobj iteration
 * callback, process_old_cb().
 */
struct process_old_arg {
	dsl_dataset_t *ds;		/* snapshot being destroyed */
	dsl_dataset_t *ds_prev;		/* previous snapshot, or NULL */
	boolean_t after_branch_point;	/* ds_prev does not point back at ds */
	zio_t *pio;			/* root zio for the async frees */
	uint64_t used, comp, uncomp;	/* space totals of blocks freed */
};
123
124 static int
125 process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
126 {
127         struct process_old_arg *poa = arg;
128         dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
129
130         ASSERT(!BP_IS_HOLE(bp));
131
132         if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
133                 dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
134                 if (poa->ds_prev && !poa->after_branch_point &&
135                     bp->blk_birth >
136                     dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
137                         dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
138                             bp_get_dsize_sync(dp->dp_spa, bp);
139                 }
140         } else {
141                 poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
142                 poa->comp += BP_GET_PSIZE(bp);
143                 poa->uncomp += BP_GET_UCSIZE(bp);
144                 dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
145         }
146         return (0);
147 }
148
/*
 * Destroy-time deadlist handling for pools still using the old
 * (pre-SPA_VERSION_DEADLISTS) format: walk ds_next's deadlist via
 * process_old_cb() — freeing blocks unique to "ds" and re-inserting the
 * rest into ds's deadlist — then swap the two datasets' deadlist objects
 * so ds_next ends up with the merged list and ds with the disposable one.
 */
static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t deadlist_obj;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	/* Root zio so we can wait for the frees issued by the callback. */
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY0(zio_wait(poa.pio));
	/* Everything freed should have been unique to ds. */
	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
	dsl_dataset_phys(ds)->ds_deadlist_obj =
	    dsl_dataset_phys(ds_next)->ds_deadlist_obj;
	dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
	dsl_deadlist_open(&ds->ds_deadlist, mos,
	    dsl_dataset_phys(ds)->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    dsl_dataset_phys(ds_next)->ds_deadlist_obj);
}
186
/*
 * Recursively remove the deadlist key "mintxg" from every clone (and
 * clone-of-clone) of "ds" whose origin txg is newer than mintxg.  Used
 * when the snapshot at mintxg goes away so clone deadlists no longer
 * keep a bucket boundary for it.
 */
static void
dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;

	/*
	 * If it is the old version, dd_clones doesn't exist so we can't
	 * find the clones, but dsl_deadlist_remove_key() is a no-op so it
	 * doesn't matter.
	 */
	if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
		return;

	/* Walk the dd_clones ZAP; each entry's value is a clone's object #. */
	for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;

		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		if (clone->ds_dir->dd_origin_txg > mintxg) {
			dsl_deadlist_remove_key(&clone->ds_deadlist,
			    mintxg, tx);
			if (dsl_dataset_remap_deadlist_exists(clone)) {
				dsl_deadlist_remove_key(
				    &clone->ds_remap_deadlist, mintxg, tx);
			}
			/* Recurse into clones of this clone. */
			dsl_dataset_remove_clones_key(clone, mintxg, tx);
		}
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
}
222
/*
 * Fold this dying snapshot's remap-deadlist state into ds_next and the
 * pool: entries in next's remap deadlist older than ds's previous
 * snapshot become pool-wide obsolete blocks, and ds's own remap deadlist
 * is merged into next's before being destroyed.
 */
static void
dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
    dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* Move blocks to be obsoleted to pool's obsolete list. */
	if (dsl_dataset_remap_deadlist_exists(ds_next)) {
		if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
			dsl_pool_create_obsolete_bpobj(dp, tx);

		dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
		    &dp->dp_obsolete_bpobj,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
	}

	/* Merge our deadlist into next's and free it. */
	if (dsl_dataset_remap_deadlist_exists(ds)) {
		uint64_t remap_deadlist_object =
		    dsl_dataset_get_remap_deadlist_object(ds);
		ASSERT(remap_deadlist_object != 0);

		/* Create next's remap deadlist under its lock if missing. */
		mutex_enter(&ds_next->ds_remap_deadlist_lock);
		if (!dsl_dataset_remap_deadlist_exists(ds_next))
			dsl_dataset_create_remap_deadlist(ds_next, tx);
		mutex_exit(&ds_next->ds_remap_deadlist_lock);

		dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
		    remap_deadlist_object, tx);
		dsl_dataset_destroy_remap_deadlist(ds, tx);
	}
}
255
/*
 * Destroy snapshot "ds" in syncing context: splice it out of the snapshot
 * chain, merge its deadlist into the next dataset's, fix up clone and
 * space accounting, remove it from the snapshot namespace, and free its
 * on-disk objects.  If "defer" is set and the snapshot still has user
 * holds or clones, it is only marked DS_FLAG_DEFER_DESTROY instead.
 * Caller must hold dp_config_rwlock as writer.
 */
void
dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
{
	int err;
	int after_branch_point = FALSE;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dataset_t *ds_prev = NULL;
	uint64_t obj;

	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));

	if (defer &&
	    (ds->ds_userrefs > 0 ||
	    dsl_dataset_phys(ds)->ds_num_children > 1)) {
		/*
		 * Still held or cloned: just flag it so the destroy
		 * completes when the last hold/clone goes away.
		 */
		ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
		spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
		return;
	}

	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

	dsl_scan_ds_destroyed(ds, tx);

	obj = ds->ds_object;

	/* Release any per-dataset feature refcounts this snapshot holds. */
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
		if (ds->ds_feature_inuse[f]) {
			dsl_dataset_deactivate_feature(obj, f, tx);
			ds->ds_feature_inuse[f] = B_FALSE;
		}
	}
	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		ASSERT3P(ds->ds_prev, ==, NULL);
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
		/* prev not pointing back at us means we're past a branch. */
		after_branch_point =
		    (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);

		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
		if (after_branch_point &&
		    dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
			/* Replace us with our next snap in prev's clone list. */
			dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
			if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
				VERIFY0(zap_add_int(mos,
				    dsl_dataset_phys(ds_prev)->
				    ds_next_clones_obj,
				    dsl_dataset_phys(ds)->ds_next_snap_obj,
				    tx));
			}
		}
		if (!after_branch_point) {
			dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
			    dsl_dataset_phys(ds)->ds_next_snap_obj;
		}
	}

	dsl_dataset_t *ds_next;
	uint64_t old_unique;
	uint64_t used = 0, comp = 0, uncomp = 0;

	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
	ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);

	old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;

	/* Splice ds out of the chain: next now points at prev. */
	dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
	dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
	    dsl_dataset_phys(ds)->ds_prev_snap_obj;
	dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
	    dsl_dataset_phys(ds)->ds_prev_snap_txg;
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
	    ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);

	if (ds_next->ds_deadlist.dl_oldfmt) {
		process_old_deadlist(ds, ds_prev, ds_next,
		    after_branch_point, tx);
	} else {
		/* Adjust prev's unique space. */
		if (ds_prev && !after_branch_point) {
			dsl_deadlist_space_range(&ds_next->ds_deadlist,
			    dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
			    dsl_dataset_phys(ds)->ds_prev_snap_txg,
			    &used, &comp, &uncomp);
			dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
		}

		/* Adjust snapused. */
		dsl_deadlist_space_range(&ds_next->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
		    -used, -comp, -uncomp, tx);

		/* Move blocks to be freed to pool's free list. */
		dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
		    &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
		    tx);
		dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
		    DD_USED_HEAD, used, comp, uncomp, tx);

		/* Merge our deadlist into next's and free it. */
		dsl_deadlist_merge(&ds_next->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
	}

	/* ds's deadlist entries now live in next's; free ds's object. */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;

	dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);

	/* Collapse range in clone heads */
	dsl_dataset_remove_clones_key(ds,
	    dsl_dataset_phys(ds)->ds_creation_txg, tx);

	if (ds_next->ds_is_snapshot) {
		dsl_dataset_t *ds_nextnext;

		/*
		 * Update next's unique to include blocks which
		 * were previously shared by only this snapshot
		 * and it.  Those blocks will be born after the
		 * prev snap and before this snap, and will have
		 * died after the next snap and before the one
		 * after that (ie. be on the snap after next's
		 * deadlist).
		 */
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds_next)->ds_next_snap_obj,
		    FTAG, &ds_nextnext));
		dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
		    dsl_dataset_phys(ds)->ds_creation_txg,
		    &used, &comp, &uncomp);
		dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
		dsl_dataset_rele(ds_nextnext, FTAG);
		ASSERT3P(ds_next->ds_prev, ==, NULL);

		/* Collapse range in this head. */
		dsl_dataset_t *hds;
		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
		dsl_deadlist_remove_key(&hds->ds_deadlist,
		    dsl_dataset_phys(ds)->ds_creation_txg, tx);
		if (dsl_dataset_remap_deadlist_exists(hds)) {
			dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
			    dsl_dataset_phys(ds)->ds_creation_txg, tx);
		}
		dsl_dataset_rele(hds, FTAG);

	} else {
		/* ds_next is the head filesystem: repoint its ds_prev. */
		ASSERT3P(ds_next->ds_prev, ==, ds);
		dsl_dataset_rele(ds_next->ds_prev, ds_next);
		ds_next->ds_prev = NULL;
		if (ds_prev) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds_next, &ds_next->ds_prev));
		}

		dsl_dataset_recalc_head_uniq(ds_next);

		/*
		 * Reduce the amount of our unconsumed refreservation
		 * being charged to our parent by the amount of
		 * new unique data we have gained.
		 */
		if (old_unique < ds_next->ds_reserved) {
			int64_t mrsdelta;
			uint64_t new_unique =
			    dsl_dataset_phys(ds_next)->ds_unique_bytes;

			ASSERT(old_unique <= new_unique);
			mrsdelta = MIN(new_unique - old_unique,
			    ds_next->ds_reserved - old_unique);
			dsl_dir_diduse_space(ds->ds_dir,
			    DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
		}
	}
	dsl_dataset_rele(ds_next, FTAG);

	/*
	 * This must be done after the dsl_traverse(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	/* remove from snapshot namespace */
	dsl_dataset_t *ds_head;
	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
	VERIFY0(dsl_dataset_hold_obj(dp,
	    dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
	VERIFY0(dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
	{
		/* Sanity: the head's snapnames ZAP maps our name to obj. */
		uint64_t val;

		err = dsl_dataset_snap_lookup(ds_head,
		    ds->ds_snapname, &val);
		ASSERT0(err);
		ASSERT3U(val, ==, obj);
	}
#endif
	VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
	dsl_dataset_rele(ds_head, FTAG);

	if (ds_prev != NULL)
		dsl_dataset_rele(ds_prev, FTAG);

	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

	/* Free the (empty) next-clones ZAP, if any. */
	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
		uint64_t count;
		ASSERT0(zap_count(mos,
		    dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
		    count == 0);
		VERIFY0(dmu_object_free(mos,
		    dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
	}
	if (dsl_dataset_phys(ds)->ds_props_obj != 0)
		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
		    tx));
	if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
		VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
		    tx));

#if defined(__FreeBSD__) && defined(_KERNEL)
	/* Tear down any /dev/zvol minors that referenced this snapshot. */
	char dsname[ZFS_MAX_DATASET_NAME_LEN];

	dsl_dataset_name(ds, dsname);
	zvol_remove_minors(dp->dp_spa, dsname);
#endif

	dsl_dir_rele(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dmu_object_free_zapified(mos, obj, tx);
}
508
509 void
510 dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
511 {
512         dsl_destroy_snapshot_arg_t *ddsa = arg;
513         const char *dsname = ddsa->ddsa_name;
514         boolean_t defer = ddsa->ddsa_defer;
515
516         dsl_pool_t *dp = dmu_tx_pool(tx);
517         dsl_dataset_t *ds;
518
519         int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
520         if (error == ENOENT)
521                 return;
522         ASSERT0(error);
523         dsl_destroy_snapshot_sync_impl(ds, defer, tx);
524         dsl_dataset_rele(ds, FTAG);
525 }
526
527 /*
528  * The semantics of this function are described in the comment above
529  * lzc_destroy_snaps().  To summarize:
530  *
531  * The snapshots must all be in the same pool.
532  *
533  * Snapshots that don't exist will be silently ignored (considered to be
534  * "already deleted").
535  *
536  * On success, all snaps will be destroyed and this will return 0.
537  * On failure, no snaps will be destroyed, the errlist will be filled in,
538  * and this will return an errno.
539  */
540 int
541 dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
542     nvlist_t *errlist)
543 {
544         if (nvlist_next_nvpair(snaps, NULL) == NULL)
545                 return (0);
546
547         /*
548          * lzc_destroy_snaps() is documented to take an nvlist whose
549          * values "don't matter".  We need to convert that nvlist to
550          * one that we know can be converted to LUA. We also don't
551          * care about any duplicate entries because the nvlist will
552          * be converted to a LUA table which should take care of this.
553          */
554         nvlist_t *snaps_normalized;
555         VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP));
556         for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
557             pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
558                 fnvlist_add_boolean_value(snaps_normalized,
559                     nvpair_name(pair), B_TRUE);
560         }
561
562         nvlist_t *arg;
563         VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP));
564         fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
565         fnvlist_free(snaps_normalized);
566         fnvlist_add_boolean_value(arg, "defer", defer);
567
568         nvlist_t *wrapper;
569         VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP));
570         fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
571         fnvlist_free(arg);
572
573         const char *program =
574             "arg = ...\n"
575             "snaps = arg['snaps']\n"
576             "defer = arg['defer']\n"
577             "errors = { }\n"
578             "has_errors = false\n"
579             "for snap, v in pairs(snaps) do\n"
580             "    errno = zfs.check.destroy{snap, defer=defer}\n"
581             "    zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
582             "    if errno == ENOENT then\n"
583             "        snaps[snap] = nil\n"
584             "    elseif errno ~= 0 then\n"
585             "        errors[snap] = errno\n"
586             "        has_errors = true\n"
587             "    end\n"
588             "end\n"
589             "if has_errors then\n"
590             "    return errors\n"
591             "end\n"
592             "for snap, v in pairs(snaps) do\n"
593             "    errno = zfs.sync.destroy{snap, defer=defer}\n"
594             "    assert(errno == 0)\n"
595             "end\n"
596             "return { }\n";
597
598         nvlist_t *result = fnvlist_alloc();
599         int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
600             program,
601             B_TRUE,
602             0,
603             zfs_lua_max_memlimit,
604             nvlist_next_nvpair(wrapper, NULL), result);
605         if (error != 0) {
606                 char *errorstr = NULL;
607                 (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
608                 if (errorstr != NULL) {
609                         zfs_dbgmsg(errorstr);
610                 }
611                 return (error);
612         }
613         fnvlist_free(wrapper);
614
615         /*
616          * lzc_destroy_snaps() is documented to fill the errlist with
617          * int32 values, so we need to covert the int64 values that are
618          * returned from LUA.
619          */
620         int rv = 0;
621         nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
622         for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
623             pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
624                 int32_t val = (int32_t)fnvpair_value_int64(pair);
625                 if (rv == 0)
626                         rv = val;
627                 fnvlist_add_int32(errlist, nvpair_name(pair), val);
628         }
629         fnvlist_free(result);
630         return (rv);
631 }
632
633 int
634 dsl_destroy_snapshot(const char *name, boolean_t defer)
635 {
636         int error;
637         nvlist_t *nvl = fnvlist_alloc();
638         nvlist_t *errlist = fnvlist_alloc();
639
640         fnvlist_add_boolean(nvl, name);
641         error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
642         fnvlist_free(errlist);
643         fnvlist_free(nvl);
644         return (error);
645 }
646
/* Context handed to kill_blkptr() via traverse_dataset(). */
struct killarg {
	dsl_dataset_t *ds;	/* dataset whose blocks are being freed */
	dmu_tx_t *tx;		/* open transaction for the frees */
};
651
652 /* ARGSUSED */
653 static int
654 kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
655     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
656 {
657         struct killarg *ka = arg;
658         dmu_tx_t *tx = ka->tx;
659
660         if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
661                 return (0);
662
663         if (zb->zb_level == ZB_ZIL_LEVEL) {
664                 ASSERT(zilog != NULL);
665                 /*
666                  * It's a block in the intent log.  It has no
667                  * accounting, so just free it.
668                  */
669                 dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
670         } else {
671                 ASSERT(zilog == NULL);
672                 ASSERT3U(bp->blk_birth, >,
673                     dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
674                 (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
675         }
676
677         return (0);
678 }
679
/*
 * Pre-async_destroy destroy path: traverse the dataset and free, within
 * this single txg, every block born after the previous snapshot.
 */
static void
old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	struct killarg ka;

	/*
	 * Free everything that we point to (that's born after
	 * the previous snapshot, if we are a clone)
	 *
	 * NB: this should be very quick, because we already
	 * freed all the objects in open context.
	 */
	ka.ds = ds;
	ka.tx = tx;
	VERIFY0(traverse_dataset(ds,
	    dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
	    kill_blkptr, &ka));
	/* If unique accounting is accurate, everything was freed. */
	ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
	    dsl_dataset_phys(ds)->ds_unique_bytes == 0);
}
700
701 int
702 dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
703 {
704         int error;
705         uint64_t count;
706         objset_t *mos;
707
708         ASSERT(!ds->ds_is_snapshot);
709         if (ds->ds_is_snapshot)
710                 return (SET_ERROR(EINVAL));
711
712         if (zfs_refcount_count(&ds->ds_longholds) != expected_holds)
713                 return (SET_ERROR(EBUSY));
714
715         mos = ds->ds_dir->dd_pool->dp_meta_objset;
716
717         /*
718          * Can't delete a head dataset if there are snapshots of it.
719          * (Except if the only snapshots are from the branch we cloned
720          * from.)
721          */
722         if (ds->ds_prev != NULL &&
723             dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
724                 return (SET_ERROR(EBUSY));
725
726         /*
727          * Can't delete if there are children of this fs.
728          */
729         error = zap_count(mos,
730             dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
731         if (error != 0)
732                 return (error);
733         if (count != 0)
734                 return (SET_ERROR(EEXIST));
735
736         if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
737             dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
738             ds->ds_prev->ds_userrefs == 0) {
739                 /* We need to remove the origin snapshot as well. */
740                 if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds))
741                         return (SET_ERROR(EBUSY));
742         }
743         return (0);
744 }
745
746 int
747 dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
748 {
749         dsl_destroy_head_arg_t *ddha = arg;
750         dsl_pool_t *dp = dmu_tx_pool(tx);
751         dsl_dataset_t *ds;
752         int error;
753
754         error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
755         if (error != 0)
756                 return (error);
757
758         error = dsl_destroy_head_check_impl(ds, 0);
759         dsl_dataset_rele(ds, FTAG);
760         return (error);
761 }
762
/*
 * Free the (already emptied) dsl_dir with object number "ddobj": drop its
 * filesystem-count and reservation accounting, destroy its ZAP objects,
 * unlink it from its parent's child-directory ZAP, and free the on-disk
 * object.  Caller must hold dp_config_rwlock as writer, and the dir's
 * head dataset must already have been destroyed.
 */
static void
dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	dd_used_t t;

	ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));

	VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));

	ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);

	/*
	 * Decrement the filesystem count for all parent filesystems.
	 *
	 * When we receive an incremental stream into a filesystem that already
	 * exists, a temporary clone is created.  We never count this temporary
	 * clone, whose name begins with a '%'.
	 */
	if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
		dsl_fs_ss_count_adjust(dd->dd_parent, -1,
		    DD_FIELD_FILESYSTEM_COUNT, tx);

	/*
	 * Remove our reservation. The impl() routine avoids setting the
	 * actual property, which would require the (already destroyed) ds.
	 */
	dsl_dir_set_reservation_sync_impl(dd, 0, tx);

	/* All space must have been released before the dir can be freed. */
	ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
	ASSERT0(dsl_dir_phys(dd)->dd_reserved);
	for (t = 0; t < DD_USED_NUM; t++)
		ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);

	/* Destroy the dir's ZAP objects, then unlink it from its parent. */
	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
	VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
	if (dsl_dir_phys(dd)->dd_clones != 0)
		VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx));
	VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
	VERIFY0(zap_remove(mos,
	    dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
	    dd->dd_myname, tx));

	dsl_dir_rele(dd, FTAG);
	dmu_object_free_zapified(mos, ddobj, tx);
}
811
/*
 * Destroy a head (non-snapshot) dataset in syncing context: release its
 * refreservation, deactivate per-dataset features, detach it from its
 * origin snapshot if it is a clone, free its data blocks (synchronously
 * on old pools, or by handing them to the async-destroy bptree), destroy
 * its deadlists, snapnames ZAP and bookmarks, free the dataset object,
 * and destroy its now-empty dsl_dir.  If this dataset was the last thing
 * keeping a defer-destroyed origin snapshot alive, destroy that too.
 *
 * Caller must hold the pool config lock as writer; any snapshots must
 * already be gone (ds_num_children <= 1).
 */
void
dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	uint64_t obj, ddobj, prevobj = 0;
	boolean_t rmorigin;

	ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
	ASSERT(ds->ds_prev == NULL ||
	    dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

	/* We need to log before removing it from the namespace. */
	spa_history_log_internal_ds(ds, "destroy", tx, "");

	/*
	 * Decide now (while ds_prev is still attached) whether our origin
	 * snapshot should also go away: it must be a deferred destroy with
	 * exactly two children (origin head + us) and no user holds.
	 */
	rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
	    DS_IS_DEFER_DESTROY(ds->ds_prev) &&
	    dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
	    ds->ds_prev->ds_userrefs == 0);

	/* Remove our reservation. */
	if (ds->ds_reserved != 0) {
		dsl_dataset_set_refreservation_sync_impl(ds,
		    (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
		    0, tx);
		ASSERT0(ds->ds_reserved);
	}

	obj = ds->ds_object;

	/* Drop the refcount on every feature this dataset had activated. */
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
		if (ds->ds_feature_inuse[f]) {
			dsl_dataset_deactivate_feature(obj, f, tx);
			ds->ds_feature_inuse[f] = B_FALSE;
		}
	}

	/* Let an in-progress scan/scrub know this dataset is going away. */
	dsl_scan_ds_destroyed(ds, tx);

	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		/* This is a clone */
		ASSERT(ds->ds_prev != NULL);
		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
		    obj);
		ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);

		/* Detach ourselves from the origin snapshot's clone list. */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
			dsl_dataset_remove_from_next_clones(ds->ds_prev,
			    obj, tx);
		}

		ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
		dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
	}

	/*
	 * Destroy the deadlist.  Unless it's a clone, the
	 * deadlist should be empty since the dataset has no snapshots.
	 * (If it's a clone, it's safe to ignore the deadlist contents
	 * since they are still referenced by the origin snapshot.)
	 */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_deadlist_obj = 0;

	if (dsl_dataset_remap_deadlist_exists(ds))
		dsl_dataset_destroy_remap_deadlist(ds, tx);

	objset_t *os;
	VERIFY0(dmu_objset_from_ds(ds, &os));

	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		/* Old pools: traverse and free everything in this txg. */
		old_synchronous_dataset_destroy(ds, tx);
	} else {
		/*
		 * Move the bptree into the pool's list of trees to
		 * clean up and update space accounting information.
		 */
		uint64_t used, comp, uncomp;

		zil_destroy_sync(dmu_objset_zil(os), tx);

		if (!spa_feature_is_active(dp->dp_spa,
		    SPA_FEATURE_ASYNC_DESTROY)) {
			/* First async destroy: create the pool's bptree. */
			dsl_scan_t *scn = dp->dp_scan;
			spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
			    tx);
			dp->dp_bptree_obj = bptree_alloc(mos, tx);
			VERIFY0(zap_add(mos,
			    DMU_POOL_DIRECTORY_OBJECT,
			    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
			    &dp->dp_bptree_obj, tx));
			ASSERT(!scn->scn_async_destroying);
			scn->scn_async_destroying = B_TRUE;
		}

		used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
		comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
		uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;

		ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
		    dsl_dataset_phys(ds)->ds_unique_bytes == used);

		/*
		 * Hand the dataset's root bp to the bptree and transfer its
		 * space accounting to $FREE for background reclamation.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		bptree_add(mos, dp->dp_bptree_obj,
		    &dsl_dataset_phys(ds)->ds_bp,
		    dsl_dataset_phys(ds)->ds_prev_snap_txg,
		    used, comp, uncomp, tx);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
		    -used, -comp, -uncomp, tx);
		dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
		    used, comp, uncomp, tx);
	}

	if (ds->ds_prev != NULL) {
		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			VERIFY0(zap_remove_int(mos,
			    dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
			    ds->ds_object, tx));
		}
		/* Remember the origin's object number for the rmorigin case. */
		prevobj = ds->ds_prev->ds_object;
		dsl_dataset_rele(ds->ds_prev, ds);
		ds->ds_prev = NULL;
	}

	/*
	 * This must be done after the dsl_traverse(), because it will
	 * re-open the objset.
	 */
	if (ds->ds_objset) {
		dmu_objset_evict(ds->ds_objset);
		ds->ds_objset = NULL;
	}

	/* Erase the link in the dir */
	dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
	dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
	ddobj = ds->ds_dir->dd_object;
	ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
	VERIFY0(zap_destroy(mos,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));

	if (ds->ds_bookmarks != 0) {
		VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
		spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
	}

	/* If this dataset was the pool's bootfs, clear that property. */
	spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);

	ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
	ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
	ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
	dsl_dir_rele(ds->ds_dir, ds);
	ds->ds_dir = NULL;
	dmu_object_free_zapified(mos, obj, tx);

	/* The dir is now empty; free its on-disk state too. */
	dsl_dir_destroy_sync(ddobj, tx);

	if (rmorigin) {
		/* We were the last holdout on a defer-destroyed origin. */
		dsl_dataset_t *prev;
		VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
		dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
		dsl_dataset_rele(prev, FTAG);
	}
}
984
985 void
986 dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
987 {
988         dsl_destroy_head_arg_t *ddha = arg;
989         dsl_pool_t *dp = dmu_tx_pool(tx);
990         dsl_dataset_t *ds;
991
992         VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
993         dsl_destroy_head_sync_impl(ds, tx);
994 #if defined(__FreeBSD__) && defined(_KERNEL)
995         zvol_remove_minors(dp->dp_spa, ddha->ddha_name);
996 #endif
997         dsl_dataset_rele(ds, FTAG);
998 }
999
1000 static void
1001 dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
1002 {
1003         dsl_destroy_head_arg_t *ddha = arg;
1004         dsl_pool_t *dp = dmu_tx_pool(tx);
1005         dsl_dataset_t *ds;
1006
1007         VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
1008
1009         /* Mark it as inconsistent on-disk, in case we crash */
1010         dmu_buf_will_dirty(ds->ds_dbuf, tx);
1011         dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
1012
1013         spa_history_log_internal_ds(ds, "destroy begin", tx, "");
1014         dsl_dataset_rele(ds, FTAG);
1015 }
1016
/*
 * Destroy the head dataset (filesystem or volume) with the given name.
 *
 * On pools with the async_destroy feature enabled this is a single sync
 * task; the data blocks are reclaimed in the background afterwards.  On
 * older pools the whole destroy happens in one txg, so to keep that txg
 * short we first run a "begin" sync task that marks the dataset
 * inconsistent, then free all its objects from open context, and only
 * then run the final destroy sync task.
 *
 * Returns 0 on success or an errno-style error from the sync task.
 */
int
dsl_destroy_head(const char *name)
{
	dsl_destroy_head_arg_t ddha;
	int error;
	spa_t *spa;
	boolean_t isenabled;

#ifdef _KERNEL
	/* Unmount any clone origin "%recv" dataset before destroying. */
	zfs_destroy_unmount_origin(name);
#endif

	/* Open the pool only to sample the async_destroy feature state. */
	error = spa_open(name, &spa, FTAG);
	if (error != 0)
		return (error);
	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
	spa_close(spa, FTAG);

	ddha.ddha_name = name;

	if (!isenabled) {
		objset_t *os;

		error = dsl_sync_task(name, dsl_destroy_head_check,
		    dsl_destroy_head_begin_sync, &ddha,
		    0, ZFS_SPACE_CHECK_DESTROY);
		if (error != 0)
			return (error);

		/*
		 * Head deletion is processed in one txg on old pools;
		 * remove the objects from open context so that the txg sync
		 * is not too long.
		 */
		error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
		if (error == 0) {
			uint64_t prev_snap_txg =
			    dsl_dataset_phys(dmu_objset_ds(os))->
			    ds_prev_snap_txg;
			/*
			 * Walk every object born after the previous snapshot
			 * and free it; errors from dmu_object_next() simply
			 * end the walk.
			 */
			for (uint64_t obj = 0; error == 0;
			    error = dmu_object_next(os, &obj, FALSE,
			    prev_snap_txg))
				(void) dmu_free_long_object(os, obj);
			/* sync out all frees */
			txg_wait_synced(dmu_objset_pool(os), 0);
			dmu_objset_disown(os, FTAG);
		}
	}

	/* Final (or only) phase: remove the dataset itself. */
	return (dsl_sync_task(name, dsl_destroy_head_check,
	    dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
}
1069
1070 /*
1071  * Note, this function is used as the callback for dmu_objset_find().  We
1072  * always return 0 so that we will continue to find and process
1073  * inconsistent datasets, even if we encounter an error trying to
1074  * process one of them.
1075  */
1076 /* ARGSUSED */
1077 int
1078 dsl_destroy_inconsistent(const char *dsname, void *arg)
1079 {
1080         objset_t *os;
1081
1082         if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
1083                 boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
1084
1085                 /*
1086                  * If the dataset is inconsistent because a resumable receive
1087                  * has failed, then do not destroy it.
1088                  */
1089                 if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
1090                         need_destroy = B_FALSE;
1091
1092                 dmu_objset_rele(os, FTAG);
1093                 if (need_destroy)
1094                         (void) dsl_destroy_head(dsname);
1095         }
1096         return (0);
1097 }